Message ID: 20210419233607.916848-2-goldstein.w.n@gmail.com
State: New
Series: [v5,1/2] x86: Optimize strlen-evex.S
On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > mostly small things but they add up to roughly 10-30% performance > improvement for strlen. The results for strnlen are bit more > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > 2 files changed, 334 insertions(+), 214 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c377cab629..651b32908e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > IFUNC_IMPL_ADD (array, i, strlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strlen_avx2) > IFUNC_IMPL_ADD (array, i, strlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strlen, > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > IFUNC_IMPL (i, name, strnlen, > IFUNC_IMPL_ADD (array, i, strnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strnlen_avx2) > IFUNC_IMPL_ADD (array, i, strnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strnlen, > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > IFUNC_IMPL (i, name, wcslen, > IFUNC_IMPL_ADD (array, i, wcslen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcslen_avx2) > IFUNC_IMPL_ADD (array, i, wcslen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcslen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcslen, > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > IFUNC_IMPL (i, name, wcsnlen, > IFUNC_IMPL_ADD (array, i, wcsnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcsnlen_avx2) > IFUNC_IMPL_ADD (array, i, wcsnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcsnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcsnlen, > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > index 1caae9e6bc..bd2e6ee44a 100644 > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > @@ -27,9 +27,11 @@ > # ifdef USE_AS_WCSLEN > # define VPCMPEQ vpcmpeqd > # define VPMINU vpminud > +# define CHAR_SIZE 4 > # else > # define VPCMPEQ vpcmpeqb > # define VPMINU vpminub > +# define CHAR_SIZE 1 > # endif > > # ifndef VZEROUPPER > @@ -41,349 +43,459 @@ > # endif > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > > .section SECTION(.text),"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > + mov %RSI_LP, %R8_LP > # ifdef USE_AS_WCSLEN > shl $2, %RSI_LP > # elif defined __ILP32__ > /* Clear the upper 32 bits. */ > movl %esi, %esi > # endif > - mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > + movl %edi, %eax > movq %rdi, %rdx > vpxor %xmm0, %xmm0, %xmm0 > - > + /* Clear high bits from edi. Only keeping bits relevant to page > + cross check. */ > + andl $(PAGE_SIZE - 1), %eax > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - > + VPCMPEQ (%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < VEC_SIZE handle special. */ > + cmpq $VEC_SIZE, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > + /* If empty continue to aligned_more. Otherwise return bit > + position of first match. */ > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Set bit for max len so that tzcnt will return min of max len > + and position of first match. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - /* Remove the leading bytes. */ > - sarl %cl, %eax > - testl %eax, %eax > - jz L(aligned_more) > +L(first_vec_x1): > tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. 
*/ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 4 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + incl %edi > + addl %edi, %eax > # endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + shrl $2, %eax > # endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > + VZEROUPPER_RETURN > > .p2align 4 > -L(aligned_more): > +L(first_vec_x2): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 3 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > - addq $VEC_SIZE, %rdi > + .p2align 4 > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 2 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 2 + 1), %edi > + addl %edi, %eax > +# endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 3 + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > -L(more_4x_vec): > + .p2align 5 > +L(aligned_more): > + /* Align data to VEC_SIZE - 1. This is the same number of > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > + code on the x4 check. */ > + orq $(VEC_SIZE - 1), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > +# ifdef USE_AS_STRNLEN > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > + it simplies the logic in last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > + subq %rdx, %rcx > +# endif > + /* Load first VEC regardless. */ > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. 
*/ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x2) > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. */ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + /* Align data to VEC_SIZE * 4 - 1. */ > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > + cmpq $(VEC_SIZE * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + incq %rdi > + movl %edi, %ecx > + orq $(VEC_SIZE * 4 - 1), %rdi > + andl $(VEC_SIZE * 4 - 1), %ecx > + /* Readjust length. */ > addq %rcx, %rsi > +# else > + incq %rdi > + orq $(VEC_SIZE * 4 - 1), %rdi > # endif > - > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - VPMINU %ymm1, %ymm2, %ymm5 > - VPMINU %ymm3, %ymm4, %ymm6 > - VPMINU %ymm5, %ymm6, %ymm5 > - > - VPCMPEQ %ymm5, %ymm0, %ymm5 > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jnz L(4x_vec_end) > - > - addq $(VEC_SIZE * 4), %rdi > - > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > - > -L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + /* Save some code size by microfusing VPMINU with the load. Since > + the matches in ymm2/ymm4 can only be returned if there where no > + matches in ymm1/ymm3 respectively there is no issue with overlap. > + */ > + vmovdqa 1(%rdi), %ymm1 > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > + > + VPMINU %ymm2, %ymm4, %ymm5 > + VPCMPEQ %ymm5, %ymm0, %ymm5 > + vpmovmskb %ymm5, %ecx > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > + jz L(loop_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm1, %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + subq %rdx, %rdi > testl %eax, %eax > + jnz L(last_vec_return_x0) > > - jnz L(first_vec_x2_check) > - subl $VEC_SIZE, %esi > - jle L(max) > - > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm2, %ymm0, %ymm2 > + vpmovmskb %ymm2, %eax > testl %eax, %eax > - > - jnz L(first_vec_x3_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > + jnz L(last_vec_return_x1) > + > + /* Combine last 2 VEC. 
*/ > + VPCMPEQ %ymm3, %ymm0, %ymm3 > + vpmovmskb %ymm3, %eax > + /* rcx has combined result from all 4 VEC. It will only be used if > + the first 3 other VEC all did not contain a match. */ > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > + subq $(VEC_SIZE * 2 - 1), %rdi > + addq %rdi, %rax > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > + > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > + subq $-(VEC_SIZE * 4), %rdi > +L(last_4x_vec_or_less_cmpeq): > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +L(last_4x_vec_or_less): > > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + vpmovmskb %ymm1, %eax > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > + VEC_SIZE * 4. */ > + testl $(VEC_SIZE * 2), %esi > + jnz L(last_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + /* length may have been negative or positive by an offset of > + VEC_SIZE * 4 depending on where this was called from. This fixes > + that. */ > + andl $(VEC_SIZE * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - VZEROUPPER_RETURN > + jnz L(last_vec_x1_check) > > - .p2align 4 > -L(first_vec_x0_check): > + subl $VEC_SIZE, %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > +# endif > > .p2align 4 > -L(first_vec_x1_check): > +L(last_vec_return_x0): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > + subq $(VEC_SIZE * 4 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_return_x1): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > + subq $(VEC_SIZE * 3 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x1_check): > + > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > L(max): > movq %r8, %rax > + VZEROUPPER_RETURN > + > + .p2align 4 > +L(last_4x_vec): > + /* Test first 2x VEC normally. */ > + testl %eax, %eax > + jnz L(last_vec_x1) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x2) > + > + /* Normalize length. 
*/ > + andl $(VEC_SIZE * 4 - 1), %esi > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x3) > + > + subl $(VEC_SIZE * 3), %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 3 + 1), %eax > + addq %rdi, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - ret > -# endif > > .p2align 4 > -L(first_vec_x0): > +L(last_vec_x1): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x1): > +L(last_vec_x2): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > - addq $VEC_SIZE, %rax > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2): > +L(last_vec_x3): > tzcntl %eax, %eax > - addq $(VEC_SIZE * 2), %rax > + subl $(VEC_SIZE * 2), %esi > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 2 + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > + VZEROUPPER_RETURN > +L(max_end): > + movq %r8, %rax > VZEROUPPER_RETURN > +# endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(4x_vec_end): > - VPCMPEQ %ymm1, %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMPEQ %ymm2, %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > +L(cross_page_boundary): > + /* Align data to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > + so no need to manually mod rdx. */ > + sarxl %edx, %eax, %eax > +# ifdef USE_AS_STRNLEN > testl %eax, %eax > - jnz L(first_vec_x1) > - VPCMPEQ %ymm3, %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > + jnz L(cross_page_less_vec) > + leaq 1(%rdi), %rcx > + subq %rdx, %rcx > + /* Check length. */ > + cmpq %rsi, %rcx > + jb L(cross_page_continue) > + movq %r8, %rax > +# else > testl %eax, %eax > - jnz L(first_vec_x2) > - VPCMPEQ %ymm4, %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > -L(first_vec_x3): > + jz L(cross_page_continue) > tzcntl %eax, %eax > - addq $(VEC_SIZE * 3), %rax > - addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > # endif > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > + > +# ifdef USE_AS_STRNLEN > + .p2align 4 > +L(cross_page_less_vec): > + tzcntl %eax, %eax > + cmpq %rax, %rsi > + cmovb %esi, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > VZEROUPPER_RETURN > +# endif > > END (STRLEN) > #endif > -- > 2.29.2 > LGTM. I am checking it in for you. Thanks.
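Two of the smaller changes in the patch above are easier to follow outside of assembly. The new entry path decides whether one unaligned 32-byte load is safe by looking only at the pointer's page offset, and the short strnlen path sets the bit for the maximum length in the match mask so that a single tzcnt returns the minimum of the length limit and the position of the first NUL. Below is a minimal C sketch of both ideas, assuming VEC_SIZE = 32 and PAGE_SIZE = 4096 as in the patch; the helper names are illustrative and not part of glibc.

#include <stdint.h>
#include <stddef.h>

enum { VEC_SIZE = 32, PAGE_SIZE = 4096 };

/* Mirrors "andl $(PAGE_SIZE - 1), %eax; cmpl $(PAGE_SIZE - VEC_SIZE), %eax":
   a single unaligned VEC_SIZE-byte load from P stays inside one page
   unless P falls in the last VEC_SIZE - 1 bytes of its page.  */
static inline int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

/* Mirrors the btsq/tzcntl pair in L(first_vec_x0): MASK has one bit per
   byte of the first vector that compared equal to NUL, and MAXLEN is at
   most VEC_SIZE.  Setting bit MAXLEN caps the trailing-zero count, so
   one count yields min (position of first NUL, MAXLEN).  */
static inline size_t
short_strnlen_from_mask (uint32_t mask, size_t maxlen)
{
  uint64_t m = (uint64_t) mask | ((uint64_t) 1 << maxlen);
  return (size_t) __builtin_ctzll (m);
}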
On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > No bug. This commit optimizes strlen-avx2.S. The optimizations are > mostly small things but they add up to roughly 10-30% performance > improvement for strlen. The results for strnlen are bit more > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > 2 files changed, 334 insertions(+), 214 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index c377cab629..651b32908e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > IFUNC_IMPL (i, name, strlen, > IFUNC_IMPL_ADD (array, i, strlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strlen_avx2) > IFUNC_IMPL_ADD (array, i, strlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strlen, > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > IFUNC_IMPL (i, name, strnlen, > IFUNC_IMPL_ADD (array, i, strnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __strnlen_avx2) > IFUNC_IMPL_ADD (array, i, strnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __strnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, strnlen, > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > IFUNC_IMPL (i, name, wcslen, > IFUNC_IMPL_ADD (array, i, wcslen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcslen_avx2) > IFUNC_IMPL_ADD (array, i, wcslen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcslen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcslen, > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > IFUNC_IMPL (i, name, wcsnlen, > IFUNC_IMPL_ADD (array, i, wcsnlen, > - CPU_FEATURE_USABLE (AVX2), > + (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2)), > __wcsnlen_avx2) > IFUNC_IMPL_ADD (array, i, wcsnlen, > (CPU_FEATURE_USABLE (AVX2) > + && CPU_FEATURE_USABLE (BMI2) > && CPU_FEATURE_USABLE (RTM)), > __wcsnlen_avx2_rtm) > IFUNC_IMPL_ADD (array, i, wcsnlen, > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > index 1caae9e6bc..bd2e6ee44a 100644 > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > @@ -27,9 +27,11 @@ > # ifdef USE_AS_WCSLEN > # define VPCMPEQ vpcmpeqd > # define VPMINU vpminud > +# define CHAR_SIZE 4 > # else > # define VPCMPEQ vpcmpeqb > # define VPMINU vpminub > +# define CHAR_SIZE 1 > # endif > > # ifndef VZEROUPPER > @@ -41,349 +43,459 @@ > # endif > > # define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > > .section SECTION(.text),"ax",@progbits > ENTRY (STRLEN) > # ifdef USE_AS_STRNLEN > - /* Check for zero length. */ > + /* Check zero length. */ > test %RSI_LP, %RSI_LP > jz L(zero) > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > + mov %RSI_LP, %R8_LP > # ifdef USE_AS_WCSLEN > shl $2, %RSI_LP > # elif defined __ILP32__ > /* Clear the upper 32 bits. */ > movl %esi, %esi > # endif > - mov %RSI_LP, %R8_LP > # endif > - movl %edi, %ecx > + movl %edi, %eax > movq %rdi, %rdx > vpxor %xmm0, %xmm0, %xmm0 > - > + /* Clear high bits from edi. Only keeping bits relevant to page > + cross check. */ > + andl $(PAGE_SIZE - 1), %eax > /* Check if we may cross page boundary with one vector load. */ > - andl $(2 * VEC_SIZE - 1), %ecx > - cmpl $VEC_SIZE, %ecx > - ja L(cros_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - > + VPCMPEQ (%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > # ifdef USE_AS_STRNLEN > - jnz L(first_vec_x0_check) > - /* Adjust length and check the end of data. */ > - subq $VEC_SIZE, %rsi > - jbe L(max) > -# else > - jnz L(first_vec_x0) > + /* If length < VEC_SIZE handle special. */ > + cmpq $VEC_SIZE, %rsi > + jbe L(first_vec_x0) > # endif > - > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > + /* If empty continue to aligned_more. Otherwise return bit > + position of first match. */ > + testl %eax, %eax > + jz L(aligned_more) > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > - addq %rcx, %rsi > +L(zero): > + xorl %eax, %eax > + ret > > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + .p2align 4 > +L(first_vec_x0): > + /* Set bit for max len so that tzcnt will return min of max len > + and position of first match. */ > + btsq %rsi, %rax > + tzcntl %eax, %eax > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > # endif > - jmp L(more_4x_vec) > > .p2align 4 > -L(cros_page_boundary): > - andl $(VEC_SIZE - 1), %ecx > - andq $-VEC_SIZE, %rdi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - /* Remove the leading bytes. */ > - sarl %cl, %eax > - testl %eax, %eax > - jz L(aligned_more) > +L(first_vec_x1): > tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. 
*/ > # ifdef USE_AS_STRNLEN > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 4 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + incl %edi > + addl %edi, %eax > # endif > - addq %rdi, %rax > - addq %rcx, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > - shrq $2, %rax > + shrl $2, %eax > # endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > + VZEROUPPER_RETURN > > .p2align 4 > -L(aligned_more): > +L(first_vec_x2): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > - to void possible addition overflow. */ > - negq %rcx > - addq $VEC_SIZE, %rcx > - > - /* Check the end of data. */ > - subq %rcx, %rsi > - jbe L(max) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 3 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > - addq $VEC_SIZE, %rdi > + .p2align 4 > +L(first_vec_x3): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > +# ifdef USE_AS_STRNLEN > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE * 2 + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 2 + 1), %edi > + addl %edi, %eax > +# endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > + .p2align 4 > +L(first_vec_x4): > + tzcntl %eax, %eax > + /* Safe to use 32 bit instructions as these are only called for > + size = [1, 159]. */ > # ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > + /* Use ecx which was computed earlier to compute correct value. > + */ > + subl $(VEC_SIZE + 1), %ecx > + addl %ecx, %eax > +# else > + subl %edx, %edi > + addl $(VEC_SIZE * 3 + 1), %edi > + addl %edi, %eax > # endif > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > +# endif > + VZEROUPPER_RETURN > > -L(more_4x_vec): > + .p2align 5 > +L(aligned_more): > + /* Align data to VEC_SIZE - 1. This is the same number of > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > + code on the x4 check. */ > + orq $(VEC_SIZE - 1), %rdi > +L(cross_page_continue): > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > since data is only aligned to VEC_SIZE. */ > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > +# ifdef USE_AS_STRNLEN > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > + it simplies the logic in last_4x_vec_or_less. */ > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > + subq %rdx, %rcx > +# endif > + /* Load first VEC regardless. */ > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +# ifdef USE_AS_STRNLEN > + /* Adjust length. If near end handle specially. 
*/ > + subq %rcx, %rsi > + jb L(last_4x_vec_or_less) > +# endif > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x2) > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > testl %eax, %eax > jnz L(first_vec_x3) > > - addq $(VEC_SIZE * 4), %rdi > - > -# ifdef USE_AS_STRNLEN > - subq $(VEC_SIZE * 4), %rsi > - jbe L(last_4x_vec_or_less) > -# endif > - > - /* Align data to 4 * VEC_SIZE. */ > - movq %rdi, %rcx > - andl $(4 * VEC_SIZE - 1), %ecx > - andq $-(4 * VEC_SIZE), %rdi > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(first_vec_x4) > > + /* Align data to VEC_SIZE * 4 - 1. */ > # ifdef USE_AS_STRNLEN > - /* Adjust length. */ > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > + cmpq $(VEC_SIZE * 4 - 1), %rsi > + jbe L(last_4x_vec_or_less_load) > + incq %rdi > + movl %edi, %ecx > + orq $(VEC_SIZE * 4 - 1), %rdi > + andl $(VEC_SIZE * 4 - 1), %ecx > + /* Readjust length. */ > addq %rcx, %rsi > +# else > + incq %rdi > + orq $(VEC_SIZE * 4 - 1), %rdi > # endif > - > + /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - VPMINU %ymm1, %ymm2, %ymm5 > - VPMINU %ymm3, %ymm4, %ymm6 > - VPMINU %ymm5, %ymm6, %ymm5 > - > - VPCMPEQ %ymm5, %ymm0, %ymm5 > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jnz L(4x_vec_end) > - > - addq $(VEC_SIZE * 4), %rdi > - > -# ifndef USE_AS_STRNLEN > - jmp L(loop_4x_vec) > -# else > +# ifdef USE_AS_STRNLEN > + /* Break if at end of length. */ > subq $(VEC_SIZE * 4), %rsi > - ja L(loop_4x_vec) > - > -L(last_4x_vec_or_less): > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > - addl $(VEC_SIZE * 2), %esi > - jle L(last_2x_vec) > + jb L(last_4x_vec_or_less_cmpeq) > +# endif > + /* Save some code size by microfusing VPMINU with the load. Since > + the matches in ymm2/ymm4 can only be returned if there where no > + matches in ymm1/ymm3 respectively there is no issue with overlap. > + */ > + vmovdqa 1(%rdi), %ymm1 > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > + > + VPMINU %ymm2, %ymm4, %ymm5 > + VPCMPEQ %ymm5, %ymm0, %ymm5 > + vpmovmskb %ymm5, %ecx > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > + jz L(loop_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm1, %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + subq %rdx, %rdi > testl %eax, %eax > + jnz L(last_vec_return_x0) > > - jnz L(first_vec_x2_check) > - subl $VEC_SIZE, %esi > - jle L(max) > - > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm2, %ymm0, %ymm2 > + vpmovmskb %ymm2, %eax > testl %eax, %eax > - > - jnz L(first_vec_x3_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > + jnz L(last_vec_return_x1) > + > + /* Combine last 2 VEC. 
*/ > + VPCMPEQ %ymm3, %ymm0, %ymm3 > + vpmovmskb %ymm3, %eax > + /* rcx has combined result from all 4 VEC. It will only be used if > + the first 3 other VEC all did not contain a match. */ > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > + subq $(VEC_SIZE * 2 - 1), %rdi > + addq %rdi, %rax > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > + > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(last_2x_vec): > - addl $(VEC_SIZE * 2), %esi > - VPCMPEQ (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > +L(last_4x_vec_or_less_load): > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > + subq $-(VEC_SIZE * 4), %rdi > +L(last_4x_vec_or_less_cmpeq): > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > +L(last_4x_vec_or_less): > > - jnz L(first_vec_x0_check) > - subl $VEC_SIZE, %esi > - jle L(max) > + vpmovmskb %ymm1, %eax > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > + VEC_SIZE * 4. */ > + testl $(VEC_SIZE * 2), %esi > + jnz L(last_4x_vec) > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > + /* length may have been negative or positive by an offset of > + VEC_SIZE * 4 depending on where this was called from. This fixes > + that. */ > + andl $(VEC_SIZE * 4 - 1), %esi > testl %eax, %eax > - jnz L(first_vec_x1_check) > - movq %r8, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - VZEROUPPER_RETURN > + jnz L(last_vec_x1_check) > > - .p2align 4 > -L(first_vec_x0_check): > + subl $VEC_SIZE, %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > +# endif > > .p2align 4 > -L(first_vec_x1_check): > +L(last_vec_return_x0): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $VEC_SIZE, %rax > + subq $(VEC_SIZE * 4 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2_check): > +L(last_vec_return_x1): > tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 2), %rax > + subq $(VEC_SIZE * 3 - 1), %rdi > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > +# ifdef USE_AS_STRNLEN > .p2align 4 > -L(first_vec_x3_check): > +L(last_vec_x1_check): > + > tzcntl %eax, %eax > /* Check the end of data. */ > - cmpq %rax, %rsi > - jbe L(max) > - addq $(VEC_SIZE * 3), %rax > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > L(max): > movq %r8, %rax > + VZEROUPPER_RETURN > + > + .p2align 4 > +L(last_4x_vec): > + /* Test first 2x VEC normally. */ > + testl %eax, %eax > + jnz L(last_vec_x1) > + > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x2) > + > + /* Normalize length. 
*/ > + andl $(VEC_SIZE * 4 - 1), %esi > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + testl %eax, %eax > + jnz L(last_vec_x3) > + > + subl $(VEC_SIZE * 3), %esi > + jb L(max) > + > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + tzcntl %eax, %eax > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 3 + 1), %eax > + addq %rdi, %rax > # ifdef USE_AS_WCSLEN > shrq $2, %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - ret > -# endif > > .p2align 4 > -L(first_vec_x0): > +L(last_vec_x1): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > + subq %rdx, %rdi > + incl %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x1): > +L(last_vec_x2): > + /* essentially duplicates of first_vec_x1 but use 64 bit > + instructions. */ > tzcntl %eax, %eax > - addq $VEC_SIZE, %rax > + subq %rdx, %rdi > + addl $(VEC_SIZE + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > VZEROUPPER_RETURN > > .p2align 4 > -L(first_vec_x2): > +L(last_vec_x3): > tzcntl %eax, %eax > - addq $(VEC_SIZE * 2), %rax > + subl $(VEC_SIZE * 2), %esi > + /* Check the end of data. */ > + cmpl %eax, %esi > + jb L(max_end) > + subq %rdx, %rdi > + addl $(VEC_SIZE * 2 + 1), %eax > addq %rdi, %rax > - subq %rdx, %rax > -# ifdef USE_AS_WCSLEN > +# ifdef USE_AS_WCSLEN > shrq $2, %rax > -# endif > +# endif > + VZEROUPPER_RETURN > +L(max_end): > + movq %r8, %rax > VZEROUPPER_RETURN > +# endif > > + /* Cold case for crossing page with first load. */ > .p2align 4 > -L(4x_vec_end): > - VPCMPEQ %ymm1, %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - VPCMPEQ %ymm2, %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > +L(cross_page_boundary): > + /* Align data to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %eax > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > + so no need to manually mod rdx. */ > + sarxl %edx, %eax, %eax This is a BMI2 instruction, which is not necessary available when AVX2 is available. This causes SIGILL on some CPU. I have reported that in https://sourceware.org/bugzilla/show_bug.cgi?id=29611 Regards Aurelien
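The failure Aurelien reports is purely a dispatch problem: sarx is guarded by the BMI2 CPUID bit, and AVX2 support does not imply BMI2, so on an AVX2-only CPU the cross-page path faults. The ifunc hunks at the top of the patch encode the required rule with CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2). Below is a stand-alone sketch of that selection rule using only documented GCC builtins; the function names are placeholders, not the real glibc internals, and the "AVX2" body is just a stand-in.

#include <stddef.h>

/* Placeholder baseline; in glibc this role is played by the SSE2
   implementation.  */
static size_t
strlen_fallback (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

/* Stand-in for the AVX2 code whose cross-page path ends in sarx.  */
static size_t
strlen_avx2_bmi2 (const char *s)
{
  return strlen_fallback (s);
}

/* The AVX2 variant may only be chosen when BMI2 is usable as well;
   otherwise the sarx in the cross-page prologue raises SIGILL.  */
static size_t (*pick_strlen (void)) (const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strlen_avx2_bmi2;
  return strlen_fallback;
}

In glibc itself this decision is made by the ifunc resolver through the CPU_FEATURE_USABLE checks shown in the diff; the builtin-based version above only makes the rule concrete.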
On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > mostly small things but they add up to roughly 10-30% performance > > improvement for strlen. The results for strnlen are bit more > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > are all passing. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index c377cab629..651b32908e 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > IFUNC_IMPL (i, name, strlen, > > IFUNC_IMPL_ADD (array, i, strlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __strlen_avx2) > > IFUNC_IMPL_ADD (array, i, strlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __strlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, strlen, > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > IFUNC_IMPL (i, name, strnlen, > > IFUNC_IMPL_ADD (array, i, strnlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __strnlen_avx2) > > IFUNC_IMPL_ADD (array, i, strnlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __strnlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, strnlen, > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wcslen.c. */ > > IFUNC_IMPL (i, name, wcslen, > > IFUNC_IMPL_ADD (array, i, wcslen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __wcslen_avx2) > > IFUNC_IMPL_ADD (array, i, wcslen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __wcslen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, wcslen, > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ > > IFUNC_IMPL (i, name, wcsnlen, > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > - CPU_FEATURE_USABLE (AVX2), > > + (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2)), > > __wcsnlen_avx2) > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > (CPU_FEATURE_USABLE (AVX2) > > + && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (RTM)), > > __wcsnlen_avx2_rtm) > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > index 1caae9e6bc..bd2e6ee44a 100644 > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > @@ -27,9 +27,11 @@ > > # ifdef USE_AS_WCSLEN > > # define VPCMPEQ vpcmpeqd > > # define VPMINU vpminud > > +# define CHAR_SIZE 4 > > # else > > # define VPCMPEQ vpcmpeqb > > # define VPMINU vpminub > > +# define CHAR_SIZE 1 > > # endif > > > > # ifndef VZEROUPPER > > @@ -41,349 +43,459 @@ > > # endif > > > > # define VEC_SIZE 32 > > +# define PAGE_SIZE 4096 > > > > .section SECTION(.text),"ax",@progbits > > ENTRY (STRLEN) > > # ifdef USE_AS_STRNLEN > > - /* Check for zero length. */ > > + /* Check zero length. */ > > test %RSI_LP, %RSI_LP > > jz L(zero) > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > + mov %RSI_LP, %R8_LP > > # ifdef USE_AS_WCSLEN > > shl $2, %RSI_LP > > # elif defined __ILP32__ > > /* Clear the upper 32 bits. */ > > movl %esi, %esi > > # endif > > - mov %RSI_LP, %R8_LP > > # endif > > - movl %edi, %ecx > > + movl %edi, %eax > > movq %rdi, %rdx > > vpxor %xmm0, %xmm0, %xmm0 > > - > > + /* Clear high bits from edi. Only keeping bits relevant to page > > + cross check. */ > > + andl $(PAGE_SIZE - 1), %eax > > /* Check if we may cross page boundary with one vector load. */ > > - andl $(2 * VEC_SIZE - 1), %ecx > > - cmpl $VEC_SIZE, %ecx > > - ja L(cros_page_boundary) > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. */ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > # ifdef USE_AS_STRNLEN > > - jnz L(first_vec_x0_check) > > - /* Adjust length and check the end of data. */ > > - subq $VEC_SIZE, %rsi > > - jbe L(max) > > -# else > > - jnz L(first_vec_x0) > > + /* If length < VEC_SIZE handle special. */ > > + cmpq $VEC_SIZE, %rsi > > + jbe L(first_vec_x0) > > # endif > > - > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > + /* If empty continue to aligned_more. Otherwise return bit > > + position of first match. */ > > + testl %eax, %eax > > + jz L(aligned_more) > > + tzcntl %eax, %eax > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > # ifdef USE_AS_STRNLEN > > - /* Adjust length. */ > > - addq %rcx, %rsi > > +L(zero): > > + xorl %eax, %eax > > + ret > > > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > + .p2align 4 > > +L(first_vec_x0): > > + /* Set bit for max len so that tzcnt will return min of max len > > + and position of first match. 
*/ > > + btsq %rsi, %rax > > + tzcntl %eax, %eax > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > # endif > > - jmp L(more_4x_vec) > > > > .p2align 4 > > -L(cros_page_boundary): > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - /* Remove the leading bytes. */ > > - sarl %cl, %eax > > - testl %eax, %eax > > - jz L(aligned_more) > > +L(first_vec_x1): > > tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > # ifdef USE_AS_STRNLEN > > - /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE * 4 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + incl %edi > > + addl %edi, %eax > > # endif > > - addq %rdi, %rax > > - addq %rcx, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > - shrq $2, %rax > > + shrl $2, %eax > > # endif > > -L(return_vzeroupper): > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > + VZEROUPPER_RETURN > > > > .p2align 4 > > -L(aligned_more): > > +L(first_vec_x2): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > # ifdef USE_AS_STRNLEN > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > - to void possible addition overflow. */ > > - negq %rcx > > - addq $VEC_SIZE, %rcx > > - > > - /* Check the end of data. */ > > - subq %rcx, %rsi > > - jbe L(max) > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE * 3 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE + 1), %edi > > + addl %edi, %eax > > # endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > - addq $VEC_SIZE, %rdi > > + .p2align 4 > > +L(first_vec_x3): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > +# ifdef USE_AS_STRNLEN > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE * 2 + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE * 2 + 1), %edi > > + addl %edi, %eax > > +# endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > + .p2align 4 > > +L(first_vec_x4): > > + tzcntl %eax, %eax > > + /* Safe to use 32 bit instructions as these are only called for > > + size = [1, 159]. */ > > # ifdef USE_AS_STRNLEN > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > + /* Use ecx which was computed earlier to compute correct value. > > + */ > > + subl $(VEC_SIZE + 1), %ecx > > + addl %ecx, %eax > > +# else > > + subl %edx, %edi > > + addl $(VEC_SIZE * 3 + 1), %edi > > + addl %edi, %eax > > # endif > > +# ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > +# endif > > + VZEROUPPER_RETURN > > > > -L(more_4x_vec): > > + .p2align 5 > > +L(aligned_more): > > + /* Align data to VEC_SIZE - 1. This is the same number of > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > + code on the x4 check. */ > > + orq $(VEC_SIZE - 1), %rdi > > +L(cross_page_continue): > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > since data is only aligned to VEC_SIZE. 
*/ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > +# ifdef USE_AS_STRNLEN > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > + it simplies the logic in last_4x_vec_or_less. */ > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > + subq %rdx, %rcx > > +# endif > > + /* Load first VEC regardless. */ > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > +# ifdef USE_AS_STRNLEN > > + /* Adjust length. If near end handle specially. */ > > + subq %rcx, %rsi > > + jb L(last_4x_vec_or_less) > > +# endif > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x1) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x2) > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x3) > > > > - addq $(VEC_SIZE * 4), %rdi > > - > > -# ifdef USE_AS_STRNLEN > > - subq $(VEC_SIZE * 4), %rsi > > - jbe L(last_4x_vec_or_less) > > -# endif > > - > > - /* Align data to 4 * VEC_SIZE. */ > > - movq %rdi, %rcx > > - andl $(4 * VEC_SIZE - 1), %ecx > > - andq $-(4 * VEC_SIZE), %rdi > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x4) > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > # ifdef USE_AS_STRNLEN > > - /* Adjust length. */ > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > + jbe L(last_4x_vec_or_less_load) > > + incq %rdi > > + movl %edi, %ecx > > + orq $(VEC_SIZE * 4 - 1), %rdi > > + andl $(VEC_SIZE * 4 - 1), %ecx > > + /* Readjust length. */ > > addq %rcx, %rsi > > +# else > > + incq %rdi > > + orq $(VEC_SIZE * 4 - 1), %rdi > > # endif > > - > > + /* Compare 4 * VEC at a time forward. */ > > .p2align 4 > > L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - vmovdqa (%rdi), %ymm1 > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > - VPMINU %ymm1, %ymm2, %ymm5 > > - VPMINU %ymm3, %ymm4, %ymm6 > > - VPMINU %ymm5, %ymm6, %ymm5 > > - > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > - vpmovmskb %ymm5, %eax > > - testl %eax, %eax > > - jnz L(4x_vec_end) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - > > -# ifndef USE_AS_STRNLEN > > - jmp L(loop_4x_vec) > > -# else > > +# ifdef USE_AS_STRNLEN > > + /* Break if at end of length. */ > > subq $(VEC_SIZE * 4), %rsi > > - ja L(loop_4x_vec) > > - > > -L(last_4x_vec_or_less): > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > - addl $(VEC_SIZE * 2), %esi > > - jle L(last_2x_vec) > > + jb L(last_4x_vec_or_less_cmpeq) > > +# endif > > + /* Save some code size by microfusing VPMINU with the load. Since > > + the matches in ymm2/ymm4 can only be returned if there where no > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > + */ > > + vmovdqa 1(%rdi), %ymm1 > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > + > > + VPMINU %ymm2, %ymm4, %ymm5 > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > + vpmovmskb %ymm5, %ecx > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > + jz L(loop_4x_vec) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + subq %rdx, %rdi > > testl %eax, %eax > > + jnz L(last_vec_return_x0) > > > > - jnz L(first_vec_x2_check) > > - subl $VEC_SIZE, %esi > > - jle L(max) > > - > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > + vpmovmskb %ymm2, %eax > > testl %eax, %eax > > - > > - jnz L(first_vec_x3_check) > > - movq %r8, %rax > > -# ifdef USE_AS_WCSLEN > > + jnz L(last_vec_return_x1) > > + > > + /* Combine last 2 VEC. */ > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > + vpmovmskb %ymm3, %eax > > + /* rcx has combined result from all 4 VEC. It will only be used if > > + the first 3 other VEC all did not contain a match. */ > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > + subq $(VEC_SIZE * 2 - 1), %rdi > > + addq %rdi, %rax > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > + > > +# ifdef USE_AS_STRNLEN > > .p2align 4 > > -L(last_2x_vec): > > - addl $(VEC_SIZE * 2), %esi > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > +L(last_4x_vec_or_less_load): > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > + subq $-(VEC_SIZE * 4), %rdi > > +L(last_4x_vec_or_less_cmpeq): > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > +L(last_4x_vec_or_less): > > > > - jnz L(first_vec_x0_check) > > - subl $VEC_SIZE, %esi > > - jle L(max) > > + vpmovmskb %ymm1, %eax > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > + VEC_SIZE * 4. */ > > + testl $(VEC_SIZE * 2), %esi > > + jnz L(last_4x_vec) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + /* length may have been negative or positive by an offset of > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > + that. */ > > + andl $(VEC_SIZE * 4 - 1), %esi > > testl %eax, %eax > > - jnz L(first_vec_x1_check) > > - movq %r8, %rax > > -# ifdef USE_AS_WCSLEN > > - shrq $2, %rax > > -# endif > > - VZEROUPPER_RETURN > > + jnz L(last_vec_x1_check) > > > > - .p2align 4 > > -L(first_vec_x0_check): > > + subl $VEC_SIZE, %esi > > + jb L(max) > > + > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > tzcntl %eax, %eax > > /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > +# endif > > > > .p2align 4 > > -L(first_vec_x1_check): > > +L(last_vec_return_x0): > > tzcntl %eax, %eax > > - /* Check the end of data. 
*/ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $VEC_SIZE, %rax > > + subq $(VEC_SIZE * 4 - 1), %rdi > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x2_check): > > +L(last_vec_return_x1): > > tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $(VEC_SIZE * 2), %rax > > + subq $(VEC_SIZE * 3 - 1), %rdi > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > +# ifdef USE_AS_STRNLEN > > .p2align 4 > > -L(first_vec_x3_check): > > +L(last_vec_x1_check): > > + > > tzcntl %eax, %eax > > /* Check the end of data. */ > > - cmpq %rax, %rsi > > - jbe L(max) > > - addq $(VEC_SIZE * 3), %rax > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + incl %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > > > - .p2align 4 > > L(max): > > movq %r8, %rax > > + VZEROUPPER_RETURN > > + > > + .p2align 4 > > +L(last_4x_vec): > > + /* Test first 2x VEC normally. */ > > + testl %eax, %eax > > + jnz L(last_vec_x1) > > + > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(last_vec_x2) > > + > > + /* Normalize length. */ > > + andl $(VEC_SIZE * 4 - 1), %esi > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(last_vec_x3) > > + > > + subl $(VEC_SIZE * 3), %esi > > + jb L(max) > > + > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + tzcntl %eax, %eax > > + /* Check the end of data. */ > > + cmpl %eax, %esi > > + jb L(max) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE * 3 + 1), %eax > > + addq %rdi, %rax > > # ifdef USE_AS_WCSLEN > > shrq $2, %rax > > # endif > > VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - ret > > -# endif > > > > .p2align 4 > > -L(first_vec_x0): > > +L(last_vec_x1): > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > + instructions. */ > > tzcntl %eax, %eax > > + subq %rdx, %rdi > > + incl %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x1): > > +L(last_vec_x2): > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > + instructions. */ > > tzcntl %eax, %eax > > - addq $VEC_SIZE, %rax > > + subq %rdx, %rdi > > + addl $(VEC_SIZE + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x2): > > +L(last_vec_x3): > > tzcntl %eax, %eax > > - addq $(VEC_SIZE * 2), %rax > > + subl $(VEC_SIZE * 2), %esi > > + /* Check the end of data. */ > > + cmpl %eax, %esi > > + jb L(max_end) > > + subq %rdx, %rdi > > + addl $(VEC_SIZE * 2 + 1), %eax > > addq %rdi, %rax > > - subq %rdx, %rax > > -# ifdef USE_AS_WCSLEN > > +# ifdef USE_AS_WCSLEN > > shrq $2, %rax > > -# endif > > +# endif > > + VZEROUPPER_RETURN > > +L(max_end): > > + movq %r8, %rax > > VZEROUPPER_RETURN > > +# endif > > > > + /* Cold case for crossing page with first load. 
*/ > > .p2align 4 > > -L(4x_vec_end): > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > - vpmovmskb %ymm2, %eax > > +L(cross_page_boundary): > > + /* Align data to VEC_SIZE - 1. */ > > + orq $(VEC_SIZE - 1), %rdi > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > + so no need to manually mod rdx. */ > > + sarxl %edx, %eax, %eax > > This is a BMI2 instruction, which is not necessary available when AVX2 > is available. This causes SIGILL on some CPU. I have reported that in > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 This is not a bug on master as: commit 83c5b368226c34a2f0a5287df40fc290b2b34359 Author: H.J. Lu <hjl.tools@gmail.com> Date: Mon Apr 19 10:45:07 2021 -0700 x86-64: Require BMI2 for strchr-avx2.S is already in tree. The issue is the avx2 changes where backported w.o H.J's changes. > > Regards > Aurelien > > -- > Aurelien Jarno GPG: 4096R/1DDD8C9B > aurelien@aurel32.net http://www.aurel32.net
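For readers following the cross-page discussion above, here is a rough scalar C model of what the new L(cross_page_boundary) path does: align the pointer down so the 32-byte load stays inside one page, build a vpmovmskb-style bit mask of the NUL positions, and shift out the mask bits belonging to bytes before the start of the string. This is an editorial sketch, not code from the patch: strlen_page_cross_model is a made-up name, __builtin_ctz stands in for tzcnt, and the trailing byte loop replaces the jump back into the aligned 4x-vector loop. The contentious instruction is the shift itself: sarx masks its count to the low bits in hardware (hence no explicit mod of %edx in the assembly), but sarx is a BMI2 instruction, which is why the selector now has to require BMI2 as well.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define VEC_SIZE 32

/* Scalar model of the AVX2 cross-page path: one aligned 32-byte "load"
   that cannot fault across a page, a compare mask, and a shift that
   discards the bytes before the real start of the string.  */
static size_t
strlen_page_cross_model (const char *s)
{
  uintptr_t addr = (uintptr_t) s;
  const char *aligned = (const char *) (addr & ~(uintptr_t) (VEC_SIZE - 1));
  unsigned int shift = addr & (VEC_SIZE - 1);

  /* Model of VPCMPEQ + vpmovmskb: one bit per NUL byte.  Only bytes at
     or after s are inspected here; the real vector load also reads the
     earlier bytes and lets the shift below throw their bits away.  */
  uint32_t mask = 0;
  for (unsigned int i = shift; i < VEC_SIZE; i++)
    if (aligned[i] == '\0')
      mask |= (uint32_t) 1 << i;

  /* The sarxl in the patch: drop the leading bytes.  sarx takes its
     count mod 32 in hardware, so the assembly never masks %edx; in C
     the & (VEC_SIZE - 1) above has to be explicit.  */
  mask >>= shift;

  if (mask != 0)
    return (size_t) __builtin_ctz (mask);   /* tzcnt equivalent */

  /* No NUL in the first vector: keep scanning (the real code jumps to
     the aligned 4x-vector loop instead of this byte loop).  */
  size_t len = VEC_SIZE - shift;
  while (s[len] != '\0')
    len++;
  return len;
}

int
main (void)
{
  printf ("%zu\n", strlen_page_cross_model ("hello, page cross"));
  return 0;
}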
Attached patch fixes BZ# 29611.

I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
if there is any objection.
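As a quick sanity check for anyone triaging the backports, the small program below (an editorial sketch, not part of the attached patch) reads CPUID leaf 7 directly to see whether the running CPU is in the configuration that hits BZ 29611, i.e. it advertises AVX2 but not BMI2. It uses the __get_cpuid_count helper from GCC/Clang's <cpuid.h>; note that glibc's CPU_FEATURE_USABLE additionally checks OS-enabled state (XSAVE), which this raw bit test does not.

#include <cpuid.h>
#include <stdio.h>

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID.(EAX=7,ECX=0):EBX -- bit 5 is AVX2, bit 8 is BMI2.  */
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    {
      puts ("CPUID leaf 7 not supported");
      return 1;
    }

  int has_avx2 = (ebx >> 5) & 1;
  int has_bmi2 = (ebx >> 8) & 1;

  printf ("AVX2: %d  BMI2: %d\n", has_avx2, has_bmi2);
  if (has_avx2 && !has_bmi2)
    puts ("AVX2 without BMI2: an unfixed __strlen_avx2 using sarx would "
          "raise SIGILL on this CPU");
  return 0;
}

On the machines where the SIGILL was reported this would be expected to print "AVX2: 1  BMI2: 0".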
Please remove me from this thread. I should not be on it.
On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Attached patch fixes BZ# 29611.
>
> I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> if there is any objection.
The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S

Can you post these as separate emails with the patches embedded instead of
attached?
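To illustrate the dispatch rule being fixed here (select the AVX2 string routines only when BMI2 is also usable), here is a small user-level sketch of an IFUNC resolver built with GCC's ifunc attribute and __builtin_cpu_supports. It is not glibc's internal ifunc-impl-list machinery: my_strlen, strlen_baseline, strlen_avx2_bmi2 and resolve_strlen are placeholder names, and the "AVX2" variant merely forwards to the baseline so the example links and runs. It assumes an ELF target where the ifunc attribute is supported.

#include <stddef.h>
#include <stdio.h>

static size_t
strlen_baseline (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

/* Stand-in for a variant that would use AVX2 plus BMI2 instructions
   such as sarx/tzcnt; it just forwards to the baseline here.  */
static size_t
strlen_avx2_bmi2 (const char *s)
{
  return strlen_baseline (s);
}

/* Resolver: pick the AVX2 variant only when BMI2 is also present,
   mirroring the CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2)
   conditions added to each IFUNC_IMPL_ADD entry.  */
static size_t (*resolve_strlen (void)) (const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strlen_avx2_bmi2;
  return strlen_baseline;
}

size_t my_strlen (const char *s) __attribute__ ((ifunc ("resolve_strlen")));

int
main (void)
{
  printf ("%zu\n", my_strlen ("glibc"));
  return 0;
}

With this gate an AVX2-only CPU simply falls back to the baseline implementation instead of faulting, which is the behaviour the missing BMI2 checks in ifunc-impl-list.c are meant to restore.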
On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > Can you post these as separate emails with the patches embedded instead of > attached? > > > Patches are also posted on bug report 29611. https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > > > mostly small things but they add up to roughly 10-30% performance > > > > > improvement for strlen. The results for strnlen are bit more > > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > > > are all passing. > > > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > > --- > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > index c377cab629..651b32908e 100644 > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > > > IFUNC_IMPL (i, name, strlen, > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __strlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __strlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > > > IFUNC_IMPL (i, name, strnlen, > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __strnlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __strnlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ > > > > > IFUNC_IMPL (i, name, wcslen, > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __wcslen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __wcslen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, wcslen, > > > > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ > > > > > IFUNC_IMPL (i, name, wcsnlen, > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > __wcsnlen_avx2) > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > __wcsnlen_avx2_rtm) > > > > > IFUNC_IMPL_ADD (array, i, wcsnlen, > > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > index 1caae9e6bc..bd2e6ee44a 100644 > > > > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S > > > > > @@ -27,9 +27,11 @@ > > > > > # ifdef USE_AS_WCSLEN > > > > > # define VPCMPEQ vpcmpeqd > > > > > # define VPMINU vpminud > > > > > +# define CHAR_SIZE 4 > > > > > # else > > > > > # define VPCMPEQ vpcmpeqb > > > > > # define VPMINU vpminub > > > > > +# define CHAR_SIZE 1 > > > > > # endif > > > > > > > > > > # ifndef VZEROUPPER > > > > > @@ -41,349 +43,459 @@ > > > > > # endif > > > > > > > > > > # define VEC_SIZE 32 > > > > > +# define PAGE_SIZE 4096 > > > > > > > > > > .section SECTION(.text),"ax",@progbits > > > > > ENTRY (STRLEN) > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Check for zero length. */ > > > > > + /* Check zero length. */ > > > > > test %RSI_LP, %RSI_LP > > > > > jz L(zero) > > > > > + /* Store max len in R8_LP before adjusting if using WCSLEN. */ > > > > > + mov %RSI_LP, %R8_LP > > > > > # ifdef USE_AS_WCSLEN > > > > > shl $2, %RSI_LP > > > > > # elif defined __ILP32__ > > > > > /* Clear the upper 32 bits. */ > > > > > movl %esi, %esi > > > > > # endif > > > > > - mov %RSI_LP, %R8_LP > > > > > # endif > > > > > - movl %edi, %ecx > > > > > + movl %edi, %eax > > > > > movq %rdi, %rdx > > > > > vpxor %xmm0, %xmm0, %xmm0 > > > > > - > > > > > + /* Clear high bits from edi. Only keeping bits relevant to page > > > > > + cross check. */ > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > /* Check if we may cross page boundary with one vector load. */ > > > > > - andl $(2 * VEC_SIZE - 1), %ecx > > > > > - cmpl $VEC_SIZE, %ecx > > > > > - ja L(cros_page_boundary) > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > + ja L(cross_page_boundary) > > > > > > > > > > /* Check the first VEC_SIZE bytes. */ > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - > > > > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > # ifdef USE_AS_STRNLEN > > > > > - jnz L(first_vec_x0_check) > > > > > - /* Adjust length and check the end of data. */ > > > > > - subq $VEC_SIZE, %rsi > > > > > - jbe L(max) > > > > > -# else > > > > > - jnz L(first_vec_x0) > > > > > + /* If length < VEC_SIZE handle special. 
*/ > > > > > + cmpq $VEC_SIZE, %rsi > > > > > + jbe L(first_vec_x0) > > > > > # endif > > > > > - > > > > > - /* Align data for aligned loads in the loop. */ > > > > > - addq $VEC_SIZE, %rdi > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > - andq $-VEC_SIZE, %rdi > > > > > + /* If empty continue to aligned_more. Otherwise return bit > > > > > + position of first match. */ > > > > > + testl %eax, %eax > > > > > + jz L(aligned_more) > > > > > + tzcntl %eax, %eax > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Adjust length. */ > > > > > - addq %rcx, %rsi > > > > > +L(zero): > > > > > + xorl %eax, %eax > > > > > + ret > > > > > > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > + .p2align 4 > > > > > +L(first_vec_x0): > > > > > + /* Set bit for max len so that tzcnt will return min of max len > > > > > + and position of first match. */ > > > > > + btsq %rsi, %rax > > > > > + tzcntl %eax, %eax > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > # endif > > > > > - jmp L(more_4x_vec) > > > > > > > > > > .p2align 4 > > > > > -L(cros_page_boundary): > > > > > - andl $(VEC_SIZE - 1), %ecx > > > > > - andq $-VEC_SIZE, %rdi > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - /* Remove the leading bytes. */ > > > > > - sarl %cl, %eax > > > > > - testl %eax, %eax > > > > > - jz L(aligned_more) > > > > > +L(first_vec_x1): > > > > > tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > + */ > > > > > + subl $(VEC_SIZE * 4 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + incl %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > - addq %rdi, %rax > > > > > - addq %rcx, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > - shrq $2, %rax > > > > > + shrl $2, %eax > > > > > # endif > > > > > -L(return_vzeroupper): > > > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > + VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(aligned_more): > > > > > +L(first_vec_x2): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" > > > > > - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" > > > > > - to void possible addition overflow. */ > > > > > - negq %rcx > > > > > - addq $VEC_SIZE, %rcx > > > > > - > > > > > - /* Check the end of data. */ > > > > > - subq %rcx, %rsi > > > > > - jbe L(max) > > > > > + /* Use ecx which was computed earlier to compute correct value. 
> > > > > + */ > > > > > + subl $(VEC_SIZE * 3 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE + 1), %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > - addq $VEC_SIZE, %rdi > > > > > + .p2align 4 > > > > > +L(first_vec_x3): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > + */ > > > > > + subl $(VEC_SIZE * 2 + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE * 2 + 1), %edi > > > > > + addl %edi, %eax > > > > > +# endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > + .p2align 4 > > > > > +L(first_vec_x4): > > > > > + tzcntl %eax, %eax > > > > > + /* Safe to use 32 bit instructions as these are only called for > > > > > + size = [1, 159]. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > + /* Use ecx which was computed earlier to compute correct value. > > > > > + */ > > > > > + subl $(VEC_SIZE + 1), %ecx > > > > > + addl %ecx, %eax > > > > > +# else > > > > > + subl %edx, %edi > > > > > + addl $(VEC_SIZE * 3 + 1), %edi > > > > > + addl %edi, %eax > > > > > # endif > > > > > +# ifdef USE_AS_WCSLEN > > > > > + shrl $2, %eax > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > > > > > > -L(more_4x_vec): > > > > > + .p2align 5 > > > > > +L(aligned_more): > > > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > > > + instructions as using andq with -VEC_SIZE but saves 4 bytes of > > > > > + code on the x4 check. */ > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > +L(cross_page_continue): > > > > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > > > since data is only aligned to VEC_SIZE. */ > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > - > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because > > > > > + it simplies the logic in last_4x_vec_or_less. */ > > > > > + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx > > > > > + subq %rdx, %rcx > > > > > +# endif > > > > > + /* Load first VEC regardless. */ > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Adjust length. If near end handle specially. 
*/ > > > > > + subq %rcx, %rsi > > > > > + jb L(last_4x_vec_or_less) > > > > > +# endif > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x1) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x2) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > testl %eax, %eax > > > > > jnz L(first_vec_x3) > > > > > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > - > > > > > -# ifdef USE_AS_STRNLEN > > > > > - subq $(VEC_SIZE * 4), %rsi > > > > > - jbe L(last_4x_vec_or_less) > > > > > -# endif > > > > > - > > > > > - /* Align data to 4 * VEC_SIZE. */ > > > > > - movq %rdi, %rcx > > > > > - andl $(4 * VEC_SIZE - 1), %ecx > > > > > - andq $-(4 * VEC_SIZE), %rdi > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(first_vec_x4) > > > > > > > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > > > # ifdef USE_AS_STRNLEN > > > > > - /* Adjust length. */ > > > > > + /* Before adjusting length check if at last VEC_SIZE * 4. */ > > > > > + cmpq $(VEC_SIZE * 4 - 1), %rsi > > > > > + jbe L(last_4x_vec_or_less_load) > > > > > + incq %rdi > > > > > + movl %edi, %ecx > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > + andl $(VEC_SIZE * 4 - 1), %ecx > > > > > + /* Readjust length. */ > > > > > addq %rcx, %rsi > > > > > +# else > > > > > + incq %rdi > > > > > + orq $(VEC_SIZE * 4 - 1), %rdi > > > > > # endif > > > > > - > > > > > + /* Compare 4 * VEC at a time forward. */ > > > > > .p2align 4 > > > > > L(loop_4x_vec): > > > > > - /* Compare 4 * VEC at a time forward. */ > > > > > - vmovdqa (%rdi), %ymm1 > > > > > - vmovdqa VEC_SIZE(%rdi), %ymm2 > > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > > > > > - VPMINU %ymm1, %ymm2, %ymm5 > > > > > - VPMINU %ymm3, %ymm4, %ymm6 > > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > > > - > > > > > - VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > - vpmovmskb %ymm5, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(4x_vec_end) > > > > > - > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > > - > > > > > -# ifndef USE_AS_STRNLEN > > > > > - jmp L(loop_4x_vec) > > > > > -# else > > > > > +# ifdef USE_AS_STRNLEN > > > > > + /* Break if at end of length. */ > > > > > subq $(VEC_SIZE * 4), %rsi > > > > > - ja L(loop_4x_vec) > > > > > - > > > > > -L(last_4x_vec_or_less): > > > > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > - jle L(last_2x_vec) > > > > > + jb L(last_4x_vec_or_less_cmpeq) > > > > > +# endif > > > > > + /* Save some code size by microfusing VPMINU with the load. Since > > > > > + the matches in ymm2/ymm4 can only be returned if there where no > > > > > + matches in ymm1/ymm3 respectively there is no issue with overlap. 
> > > > > + */ > > > > > + vmovdqa 1(%rdi), %ymm1 > > > > > + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 > > > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 > > > > > + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 > > > > > + > > > > > + VPMINU %ymm2, %ymm4, %ymm5 > > > > > + VPCMPEQ %ymm5, %ymm0, %ymm5 > > > > > + vpmovmskb %ymm5, %ecx > > > > > > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > + testl %ecx, %ecx > > > > > + jz L(loop_4x_vec) > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x1) > > > > > > > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + subq %rdx, %rdi > > > > > testl %eax, %eax > > > > > + jnz L(last_vec_return_x0) > > > > > > > > > > - jnz L(first_vec_x2_check) > > > > > - subl $VEC_SIZE, %esi > > > > > - jle L(max) > > > > > - > > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > + vpmovmskb %ymm2, %eax > > > > > testl %eax, %eax > > > > > - > > > > > - jnz L(first_vec_x3_check) > > > > > - movq %r8, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > + jnz L(last_vec_return_x1) > > > > > + > > > > > + /* Combine last 2 VEC. */ > > > > > + VPCMPEQ %ymm3, %ymm0, %ymm3 > > > > > + vpmovmskb %ymm3, %eax > > > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > > > + the first 3 other VEC all did not contain a match. */ > > > > > + salq $32, %rcx > > > > > + orq %rcx, %rax > > > > > + tzcntq %rax, %rax > > > > > + subq $(VEC_SIZE * 2 - 1), %rdi > > > > > + addq %rdi, %rax > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > + > > > > > +# ifdef USE_AS_STRNLEN > > > > > .p2align 4 > > > > > -L(last_2x_vec): > > > > > - addl $(VEC_SIZE * 2), %esi > > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > +L(last_4x_vec_or_less_load): > > > > > + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > > +L(last_4x_vec_or_less_cmpeq): > > > > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > > > > +L(last_4x_vec_or_less): > > > > > > > > > > - jnz L(first_vec_x0_check) > > > > > - subl $VEC_SIZE, %esi > > > > > - jle L(max) > > > > > + vpmovmskb %ymm1, %eax > > > > > + /* If remaining length > VEC_SIZE * 2. This works if esi is off by > > > > > + VEC_SIZE * 4. */ > > > > > + testl $(VEC_SIZE * 2), %esi > > > > > + jnz L(last_4x_vec) > > > > > > > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > + /* length may have been negative or positive by an offset of > > > > > + VEC_SIZE * 4 depending on where this was called from. This fixes > > > > > + that. 
*/ > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > testl %eax, %eax > > > > > - jnz L(first_vec_x1_check) > > > > > - movq %r8, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > - shrq $2, %rax > > > > > -# endif > > > > > - VZEROUPPER_RETURN > > > > > + jnz L(last_vec_x1_check) > > > > > > > > > > - .p2align 4 > > > > > -L(first_vec_x0_check): > > > > > + subl $VEC_SIZE, %esi > > > > > + jb L(max) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > tzcntl %eax, %eax > > > > > /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > +# endif > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x1_check): > > > > > +L(last_vec_return_x0): > > > > > tzcntl %eax, %eax > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $VEC_SIZE, %rax > > > > > + subq $(VEC_SIZE * 4 - 1), %rdi > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x2_check): > > > > > +L(last_vec_return_x1): > > > > > tzcntl %eax, %eax > > > > > - /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > + subq $(VEC_SIZE * 3 - 1), %rdi > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > +# ifdef USE_AS_STRNLEN > > > > > .p2align 4 > > > > > -L(first_vec_x3_check): > > > > > +L(last_vec_x1_check): > > > > > + > > > > > tzcntl %eax, %eax > > > > > /* Check the end of data. */ > > > > > - cmpq %rax, %rsi > > > > > - jbe L(max) > > > > > - addq $(VEC_SIZE * 3), %rax > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + incl %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > - .p2align 4 > > > > > L(max): > > > > > movq %r8, %rax > > > > > + VZEROUPPER_RETURN > > > > > + > > > > > + .p2align 4 > > > > > +L(last_4x_vec): > > > > > + /* Test first 2x VEC normally. */ > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x1) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x2) > > > > > + > > > > > + /* Normalize length. */ > > > > > + andl $(VEC_SIZE * 4 - 1), %esi > > > > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + testl %eax, %eax > > > > > + jnz L(last_vec_x3) > > > > > + > > > > > + subl $(VEC_SIZE * 3), %esi > > > > > + jb L(max) > > > > > + > > > > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + tzcntl %eax, %eax > > > > > + /* Check the end of data. 
*/ > > > > > + cmpl %eax, %esi > > > > > + jb L(max) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE * 3 + 1), %eax > > > > > + addq %rdi, %rax > > > > > # ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > # endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > - .p2align 4 > > > > > -L(zero): > > > > > - xorl %eax, %eax > > > > > - ret > > > > > -# endif > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x0): > > > > > +L(last_vec_x1): > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > + instructions. */ > > > > > tzcntl %eax, %eax > > > > > + subq %rdx, %rdi > > > > > + incl %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x1): > > > > > +L(last_vec_x2): > > > > > + /* essentially duplicates of first_vec_x1 but use 64 bit > > > > > + instructions. */ > > > > > tzcntl %eax, %eax > > > > > - addq $VEC_SIZE, %rax > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > VZEROUPPER_RETURN > > > > > > > > > > .p2align 4 > > > > > -L(first_vec_x2): > > > > > +L(last_vec_x3): > > > > > tzcntl %eax, %eax > > > > > - addq $(VEC_SIZE * 2), %rax > > > > > + subl $(VEC_SIZE * 2), %esi > > > > > + /* Check the end of data. */ > > > > > + cmpl %eax, %esi > > > > > + jb L(max_end) > > > > > + subq %rdx, %rdi > > > > > + addl $(VEC_SIZE * 2 + 1), %eax > > > > > addq %rdi, %rax > > > > > - subq %rdx, %rax > > > > > -# ifdef USE_AS_WCSLEN > > > > > +# ifdef USE_AS_WCSLEN > > > > > shrq $2, %rax > > > > > -# endif > > > > > +# endif > > > > > + VZEROUPPER_RETURN > > > > > +L(max_end): > > > > > + movq %r8, %rax > > > > > VZEROUPPER_RETURN > > > > > +# endif > > > > > > > > > > + /* Cold case for crossing page with first load. */ > > > > > .p2align 4 > > > > > -L(4x_vec_end): > > > > > - VPCMPEQ %ymm1, %ymm0, %ymm1 > > > > > - vpmovmskb %ymm1, %eax > > > > > - testl %eax, %eax > > > > > - jnz L(first_vec_x0) > > > > > - VPCMPEQ %ymm2, %ymm0, %ymm2 > > > > > - vpmovmskb %ymm2, %eax > > > > > +L(cross_page_boundary): > > > > > + /* Align data to VEC_SIZE - 1. */ > > > > > + orq $(VEC_SIZE - 1), %rdi > > > > > + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 > > > > > + vpmovmskb %ymm1, %eax > > > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > > > + so no need to manually mod rdx. */ > > > > > + sarxl %edx, %eax, %eax > > > > > > > > This is a BMI2 instruction, which is not necessary available when AVX2 > > > > is available. This causes SIGILL on some CPU. I have reported that in > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > > > > > This is not a bug on master as: > > > > > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359 > > > Author: H.J. Lu <hjl.tools@gmail.com> > > > Date: Mon Apr 19 10:45:07 2021 -0700 > > > > > > x86-64: Require BMI2 for strchr-avx2.S > > > > > > is already in tree. The issue is the avx2 changes where backported > > > w.o H.J's changes. > > > > > > > > Regards > > > > Aurelien > > > > > > > > -- > > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > > aurelien@aurel32.net http://www.aurel32.net
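The SIGILL described above comes down to AVX2 and BMI2 being independent CPUID feature bits, so a CPU (or a VM CPU model) can advertise AVX2 without BMI2 and then fault on sarx. Below is a minimal standalone check of the two bits, sketched with GCC/clang's <cpuid.h>; it is illustration only, not glibc's internal feature detection, and the OSXSAVE/XGETBV check needed for full AVX2 usability is left out.

    #include <cpuid.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned int eax, ebx, ecx, edx;
      /* CPUID leaf 7, subleaf 0: EBX bit 5 = AVX2, EBX bit 8 = BMI2.  */
      if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
        return 1;
      printf ("AVX2: %u  BMI2: %u\n", (ebx >> 5) & 1, (ebx >> 8) & 1);
      return 0;
    }

A machine that prints "AVX2: 1  BMI2: 0" is exactly the case the report is about: the AVX2 implementation gets selected, then traps on the first BMI2 instruction it reaches.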
On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > > > Can you post these as separate emails with the patches embedded instead of > > attached? > > > > > > > Patches are also posted on bug report 29611. > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 is there a mailing list for backport patches like this? > > > > > > > On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > > > > > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote: > > > > > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > > > > > > mostly small things but they add up to roughly 10-30% performance > > > > > > improvement for strlen. The results for strnlen are bit more > > > > > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > > > > > > are all passing. > > > > > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > > > --- > > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- > > > > > > sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- > > > > > > 2 files changed, 334 insertions(+), 214 deletions(-) > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > index c377cab629..651b32908e 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/strlen.c. */ > > > > > > IFUNC_IMPL (i, name, strlen, > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __strlen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __strlen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, strlen, > > > > > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */ > > > > > > IFUNC_IMPL (i, name, strnlen, > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > - CPU_FEATURE_USABLE (AVX2), > > > > > > + (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > __strnlen_avx2) > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > (CPU_FEATURE_USABLE (AVX2) > > > > > > + && CPU_FEATURE_USABLE (BMI2) > > > > > > && CPU_FEATURE_USABLE (RTM)), > > > > > > __strnlen_avx2_rtm) > > > > > > IFUNC_IMPL_ADD (array, i, strnlen, > > > > > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ > > > > > > [...]
On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. It doesn't fully fix BZ #29611. As Noah mentioned, we need to add a BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". H.J.
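The reason a single header change covers the whole set is that every routine built on ifunc-avx2.h takes its selector from that one place, so tightening the condition there retargets the dispatch for all of them at once. A self-contained sketch of that selector pattern follows, with stub implementations standing in for the real SSE2/AVX2 assembly; the names are placeholders, not glibc's, and the OS XSAVE check is again omitted.

    #include <cpuid.h>
    #include <stdio.h>
    #include <string.h>

    typedef size_t (*strlen_impl) (const char *);

    /* Stand-ins for the baseline and optimized assembly implementations.  */
    static size_t strlen_baseline (const char *s) { return strlen (s); }
    static size_t strlen_avx2_like (const char *s) { return strlen (s); }

    /* Return the optimized variant only when every extension it uses is
       present: AVX2 for the vector code and BMI2 for sarx.  This mirrors the
       CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2) gating above.  */
    static strlen_impl
    select_strlen (void)
    {
      unsigned int a, b, c, d;
      if (__get_cpuid_count (7, 0, &a, &b, &c, &d)
          && (b & (1u << 5))   /* AVX2 */
          && (b & (1u << 8)))  /* BMI2 */
        return strlen_avx2_like;
      return strlen_baseline;
    }

    int
    main (void)
    {
      printf ("%zu\n", select_strlen () ("glibc"));
      return 0;
    }

The ifunc-impl-list.c entries play a different role: they do not drive dispatch, they enumerate the implementations that the test and benchmark harnesses exercise, which is why the BMI2 requirement is wanted there as well.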
On Wed, Sep 28, 2022 at 8:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S > > > > > > Can you post these as separate emails with the patches embedded instead of > > > attached? > > > > > > > > > > > Patches are also posted on bug report 29611. > > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611 > > is there a mailing list for backport patches like this? It is libc-stable.
On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". > > > H.J. Pulling up corresponding patches are extremely difficult as they are not modular. I can modify existing patches (as posted on bug report) to incorporate ifunc-impl-list.c functionality. If it is OK? For backporting small incremental changes are preferred. Single monolithic patch makes backporting extremely difficult, if not impossible.
On Wed, Sep 28, 2022 at 12:09 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > It doesn't fully fix BZ #29611. Like Noah mentioned, we need to add > > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h". > > > > > > H.J. > > Pulling up corresponding patches are extremely difficult as they are not > modular. I can modify existing patches (as posted on bug report) to > incorporate ifunc-impl-list.c functionality. If it is OK? Please mention BZ #29611 in the commit log of the backport and submit a separate patch to fully fix BZ #29611. We should use a patch set for BZ #29611. > For backporting small incremental changes are preferred. Single monolithic > patch makes backporting extremely difficult, if not impossible.
On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > Attached patch fixes BZ# 29611. > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > if there is any objection. Sorry to be late on this. I have a few comments about that patch: > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > From: "H.J. Lu" <hjl.tools@gmail.com> > Date: Mon, 19 Apr 2021 10:45:07 -0700 > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > Since strchr-avx2.S updated by > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > Author: noah <goldstein.w.n@gmail.com> > Date: Wed Feb 3 00:38:59 2021 -0500 > > x86-64: Refactor and improve performance of strchr-avx2.S > > uses sarx: > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > ifunc-avx2.h. > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > --- > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > 2 files changed, 11 insertions(+), 5 deletions(-) First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got backported to 2.32 and older branches, and strchr-avx2.S in those branches do not use BMI2 instructions. So it doesn't make sense to backport it. That said the change in ifunc-avx2.h fixes: - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: Optimize memchr-avx2.S") - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: Optimize strlen-avx2.S") So the issues are fixed, but mostly by chance. NB: at this stage, I haven't verified the consistency of the ifunc selectors with ifunc-impl-list.c.
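A quick way to confirm the breakage Aurelien describes on a given build is a reproducer that actually reaches the BMI2 instruction: in the strlen-avx2.S quoted in this thread, sarx sits in the L(cross_page_boundary) path, so the string has to start close to the end of a page. The sketch below assumes an affected 2.29-2.32 era build and a CPU that reports AVX2 but not BMI2; on anything else it simply prints 2.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main (void)
    {
      long page = sysconf (_SC_PAGESIZE);
      /* Map two pages so the wide vector load itself is always legal.  */
      char *buf = mmap (NULL, 2 * page, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (buf == MAP_FAILED)
        return 1;
      /* Start the string 3 bytes before the page boundary: a 32-byte load
         from here would cross the page, steering the AVX2 strlen into its
         page-cross path, which is where the sarx instruction lives.  */
      char *s = buf + page - 3;
      memcpy (s, "ab", 3);
      printf ("%zu\n", strlen (s));
      return 0;
    }

The memchr/rawmemchr breakage from the other backport can be probed the same way, though where exactly the BMI2 instruction sits differs per routine.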
On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > Sorry to be late on this. I have a few comments about that patch: > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > From: "H.J. Lu" <hjl.tools@gmail.com> > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > Since strchr-avx2.S updated by > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > Author: noah <goldstein.w.n@gmail.com> > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > uses sarx: > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > ifunc-avx2.h. > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > --- > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > 2 files changed, 11 insertions(+), 5 deletions(-) > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > backported to 2.32 and older branches, and strchr-avx2.S in those > branches do not use BMI2 instructions. So it doesn't make sense to > backport it. > > That said the change in ifunc-avx2.h fixes: > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > Optimize memchr-avx2.S") > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > Optimize strlen-avx2.S") > > So the issues are fixed, but mostly by chance. > > NB: at this stage, I haven't verified the consistency of the ifunc > selectors with ifunc-impl-list.c. > Changes to ifunc-impl-list.c aren't strictly needed since strchr functions don't use BMI2. AVX2 strchr functions are still tested on machines with AVX2 and BMI2.
On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > Attached patch fixes BZ# 29611. > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > if there is any objection. > > Sorry to be late on this. I have a few comments about that patch: > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > From: "H.J. Lu" <hjl.tools@gmail.com> > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > Since strchr-avx2.S updated by > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > Author: noah <goldstein.w.n@gmail.com> > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > uses sarx: > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > ifunc-avx2.h. > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > --- > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > 2 files changed, 11 insertions(+), 5 deletions(-) > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > backported to 2.32 and older branches, and strchr-avx2.S in those > branches do not use BMI2 instructions. So it doesn't make sense to > backport it. > > That said the change in ifunc-avx2.h fixes: > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > Optimize memchr-avx2.S") > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > Optimize strlen-avx2.S") > > So the issues are fixed, but mostly by chance. How do you know it is a "by chance" fix, do you have any evidence to back your claim? > > NB: at this stage, I haven't verified the consistency of the ifunc > selectors with ifunc-impl-list.c. > > -- > Aurelien Jarno GPG: 4096R/1DDD8C9B > aurelien@aurel32.net http://www.aurel32.net
On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > Since strchr-avx2.S updated by > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > Author: noah <goldstein.w.n@gmail.com> > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > uses sarx: > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > ifunc-avx2.h. > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > --- > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > backported to 2.32 and older branches, and strchr-avx2.S in those > > branches do not use BMI2 instructions. So it doesn't make sense to > > backport it. > > > > That said the change in ifunc-avx2.h fixes: > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > Optimize memchr-avx2.S") > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > Optimize strlen-avx2.S") > > > > So the issues are fixed, but mostly by chance. > > How do you know it is a "by chance" fix, do you have any evidence to back > your claim? There might not be evidence about the intention of the authors but clearly the strchr commit message does not clarify that it also fixes memchr/strlen. > > > > > NB: at this stage, I haven't verified the consistency of the ifunc > > selectors with ifunc-impl-list.c. > > > > -- > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > aurelien@aurel32.net http://www.aurel32.net
On Wed, Oct 5, 2022 at 7:23 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > > > Since strchr-avx2.S updated by > > > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > > Author: noah <goldstein.w.n@gmail.com> > > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > > > uses sarx: > > > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > > ifunc-avx2.h. > > > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > > --- > > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > > backported to 2.32 and older branches, and strchr-avx2.S in those > > > branches do not use BMI2 instructions. So it doesn't make sense to > > > backport it. > > > > > > That said the change in ifunc-avx2.h fixes: > > > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > > Optimize memchr-avx2.S") > > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > > Optimize strlen-avx2.S") > > > > > > So the issues are fixed, but mostly by chance. > > > > How do you know it is a "by chance" fix, do you have any evidence to back > > your claim? > > There might not be evidence about the intention of the authors but clearly > the strchr commit message does not clarify that it also fixes memchr/strlen. ifunc-avx2.h header file is used in many functions, so fix in ifunc-avx2.h fixes all those functions too. It's not "by chance", I scan all the functions where ifunc-avx2.h are used before backporting it. Since this is a backport commit and no extra changes are made, there is no need to modify the original author commit message. > > > > > > > > NB: at this stage, I haven't verified the consistency of the ifunc > > > selectors with ifunc-impl-list.c. > > > > > > -- > > > Aurelien Jarno GPG: 4096R/1DDD8C9B > > > aurelien@aurel32.net http://www.aurel32.net
On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote: > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > Attached patch fixes BZ# 29611. > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > if there is any objection. > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > Since strchr-avx2.S updated by > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > Author: noah <goldstein.w.n@gmail.com> > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > uses sarx: > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > ifunc-avx2.h. > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > --- > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > backported to 2.32 and older branches, and strchr-avx2.S in those > > branches do not use BMI2 instructions. So it doesn't make sense to > > backport it. > > > > That said the change in ifunc-avx2.h fixes: > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > Optimize memchr-avx2.S") > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > Optimize strlen-avx2.S") > > > > So the issues are fixed, but mostly by chance. > > How do you know it is a "by chance" fix, do you have any evidence to back > your claim? My point is that the commit that has been backported is fixing a bug that doesn't exist in the 2.32 and older branches. strchr-avx2.S does not use the sarx instruction as the commit claims, and does not use other BMI2 instructions either. However, following the backport of commits acfd088a1963 and aaa23c350715 in these branches, memchr-avx2.S and strlen-avx2.S use BMI2 instructions, and as they use ifunc-avx2.h, this actually fixes the bug.
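For readers unfamiliar with the instruction being discussed: sarx is a BMI2-encoded arithmetic right shift that takes its count from an arbitrary register and leaves the flags untouched, which is why the rewritten AVX2 routines prefer it over the older sarl %cl sequence (see the sarxl in the strlen-avx2.S diff below). The semantics are ordinary; the problem is purely that executing it on a CPU without BMI2 raises SIGILL. A C model of what it computes, for illustration only and assuming the usual arithmetic-shift behaviour of signed right shift:

/* Illustration only: scalar model of what sarx computes for a 32-bit
   operand.  The count may come from any register and only its low five
   bits are used; the flags are not modified.  The issue in BZ #29611 is
   not the computation but that the encoding requires BMI2.  */
#include <stdint.h>

static inline int32_t
sarx_model (int32_t value, uint32_t count)
{
  return value >> (count & 31);
}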
On Wed, Oct 5, 2022 at 10:11 AM Aurelien Jarno <aurelien@aurel32.net> wrote: > > On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote: > > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote: > > > > > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote: > > > > Attached patch fixes BZ# 29611. > > > > > > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know > > > > if there is any objection. > > > > > > Sorry to be late on this. I have a few comments about that patch: > > > > > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001 > > > > From: "H.J. Lu" <hjl.tools@gmail.com> > > > > Date: Mon, 19 Apr 2021 10:45:07 -0700 > > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S > > > > > > > > Since strchr-avx2.S updated by > > > > > > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8 > > > > Author: noah <goldstein.w.n@gmail.com> > > > > Date: Wed Feb 3 00:38:59 2021 -0500 > > > > > > > > x86-64: Refactor and improve performance of strchr-avx2.S > > > > > > > > uses sarx: > > > > > > > > c4 e2 72 f7 c0 sarx %ecx,%eax,%eax > > > > > > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and > > > > ifunc-avx2.h. > > > > > > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359) > > > > --- > > > > sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- > > > > 2 files changed, 11 insertions(+), 5 deletions(-) > > > > > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got > > > backported to 2.32 and older branches, and strchr-avx2.S in those > > > branches do not use BMI2 instructions. So it doesn't make sense to > > > backport it. > > > > > > That said the change in ifunc-avx2.h fixes: > > > > > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86: > > > Optimize memchr-avx2.S") > > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86: > > > Optimize strlen-avx2.S") > > > > > > So the issues are fixed, but mostly by chance. > > > > How do you know it is a "by chance" fix, do you have any evidence to back > > your claim? > > My point is that the commit that has been backported is fixing a bug > that doesn't exist in 2.32 branches. strchr-avx2.S does not the sarx > instruction as the commit claims, and does not use other BMI2 > instructions either. > > However following the backport of commit acfd088a1963 and aaa23c350715 > in these branches, memchr-avx2.S and strlen-avx2.S use BMI2 > instructions, and as they use ifunc-avx2.h, this actually fixes the bug. > This patch got selected because it fixes the ifunc-avx2.h file. My preference is to take an existing patch if possible, rather than creating a new one for branches. You are right, the original patch should have been composed differently to make it crystal clear. For backporting it's preferable to have small independent patches with logical grouping. > -- > Aurelien Jarno GPG: 4096R/1DDD8C9B > aurelien@aurel32.net http://www.aurel32.net
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c377cab629..651b32908e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strlen.c. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strlen_avx2) IFUNC_IMPL_ADD (array, i, strlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strlen, @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strnlen.c. */ IFUNC_IMPL (i, name, strnlen, IFUNC_IMPL_ADD (array, i, strnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __strnlen_avx2) IFUNC_IMPL_ADD (array, i, strnlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __strnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strnlen, @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcslen.c. */ IFUNC_IMPL (i, name, wcslen, IFUNC_IMPL_ADD (array, i, wcslen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcslen_avx2) IFUNC_IMPL_ADD (array, i, wcslen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __wcslen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcslen, @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ IFUNC_IMPL (i, name, wcsnlen, IFUNC_IMPL_ADD (array, i, wcsnlen, - CPU_FEATURE_USABLE (AVX2), + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2)), __wcsnlen_avx2) IFUNC_IMPL_ADD (array, i, wcsnlen, (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (RTM)), __wcsnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsnlen, diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index 1caae9e6bc..bd2e6ee44a 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -27,9 +27,11 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd # define VPMINU vpminud +# define CHAR_SIZE 4 # else # define VPCMPEQ vpcmpeqb # define VPMINU vpminub +# define CHAR_SIZE 1 # endif # ifndef VZEROUPPER @@ -41,349 +43,459 @@ # endif # define VEC_SIZE 32 +# define PAGE_SIZE 4096 .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN - /* Check for zero length. */ + /* Check zero length. */ test %RSI_LP, %RSI_LP jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP # ifdef USE_AS_WCSLEN shl $2, %RSI_LP # elif defined __ILP32__ /* Clear the upper 32 bits. */ movl %esi, %esi # endif - mov %RSI_LP, %R8_LP # endif - movl %edi, %ecx + movl %edi, %eax movq %rdi, %rdx vpxor %xmm0, %xmm0, %xmm0 - + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. */ + andl $(PAGE_SIZE - 1), %eax /* Check if we may cross page boundary with one vector load. 
*/ - andl $(2 * VEC_SIZE - 1), %ecx - cmpl $VEC_SIZE, %ecx - ja L(cros_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - + VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax # ifdef USE_AS_STRNLEN - jnz L(first_vec_x0_check) - /* Adjust length and check the end of data. */ - subq $VEC_SIZE, %rsi - jbe L(max) -# else - jnz L(first_vec_x0) + /* If length < VEC_SIZE handle special. */ + cmpq $VEC_SIZE, %rsi + jbe L(first_vec_x0) # endif - - /* Align data for aligned loads in the loop. */ - addq $VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi + /* If empty continue to aligned_more. Otherwise return bit + position of first match. */ + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # ifdef USE_AS_STRNLEN - /* Adjust length. */ - addq %rcx, %rsi +L(zero): + xorl %eax, %eax + ret - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + .p2align 4 +L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ + btsq %rsi, %rax + tzcntl %eax, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN # endif - jmp L(more_4x_vec) .p2align 4 -L(cros_page_boundary): - andl $(VEC_SIZE - 1), %ecx - andq $-VEC_SIZE, %rdi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bytes. */ - sarl %cl, %eax - testl %eax, %eax - jz L(aligned_more) +L(first_vec_x1): tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + incl %edi + addl %edi, %eax # endif - addq %rdi, %rax - addq %rcx, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN - shrq $2, %rax + shrl $2, %eax # endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN + VZEROUPPER_RETURN .p2align 4 -L(aligned_more): +L(first_vec_x2): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ # ifdef USE_AS_STRNLEN - /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" - with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" - to void possible addition overflow. */ - negq %rcx - addq $VEC_SIZE, %rcx - - /* Check the end of data. */ - subq %rcx, %rsi - jbe L(max) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN - addq $VEC_SIZE, %rdi + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. */ +# ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax +# endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN + .p2align 4 +L(first_vec_x4): + tzcntl %eax, %eax + /* Safe to use 32 bit instructions as these are only called for + size = [1, 159]. 
*/ # ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) + /* Use ecx which was computed earlier to compute correct value. + */ + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax +# else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax # endif +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif + VZEROUPPER_RETURN -L(more_4x_vec): + .p2align 5 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq with -VEC_SIZE but saves 4 bytes of + code on the x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time since data is only aligned to VEC_SIZE. */ - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax +# ifdef USE_AS_STRNLEN + /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because + it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx +# endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +# ifdef USE_AS_STRNLEN + /* Adjust length. If near end handle specially. */ + subq %rcx, %rsi + jb L(last_4x_vec_or_less) +# endif + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax jnz L(first_vec_x3) - addq $(VEC_SIZE * 4), %rdi - -# ifdef USE_AS_STRNLEN - subq $(VEC_SIZE * 4), %rsi - jbe L(last_4x_vec_or_less) -# endif - - /* Align data to 4 * VEC_SIZE. */ - movq %rdi, %rcx - andl $(4 * VEC_SIZE - 1), %ecx - andq $-(4 * VEC_SIZE), %rdi + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ # ifdef USE_AS_STRNLEN - /* Adjust length. */ + /* Before adjusting length check if at last VEC_SIZE * 4. */ + cmpq $(VEC_SIZE * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx + /* Readjust length. */ addq %rcx, %rsi +# else + incq %rdi + orq $(VEC_SIZE * 4 - 1), %rdi # endif - + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ - vmovdqa (%rdi), %ymm1 - vmovdqa VEC_SIZE(%rdi), %ymm2 - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 - VPMINU %ymm1, %ymm2, %ymm5 - VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 - - VPCMPEQ %ymm5, %ymm0, %ymm5 - vpmovmskb %ymm5, %eax - testl %eax, %eax - jnz L(4x_vec_end) - - addq $(VEC_SIZE * 4), %rdi - -# ifndef USE_AS_STRNLEN - jmp L(loop_4x_vec) -# else +# ifdef USE_AS_STRNLEN + /* Break if at end of length. */ subq $(VEC_SIZE * 4), %rsi - ja L(loop_4x_vec) - -L(last_4x_vec_or_less): - /* Less than 4 * VEC and aligned to VEC_SIZE. */ - addl $(VEC_SIZE * 2), %esi - jle L(last_2x_vec) + jb L(last_4x_vec_or_less_cmpeq) +# endif + /* Save some code size by microfusing VPMINU with the load. Since + the matches in ymm2/ymm4 can only be returned if there where no + matches in ymm1/ymm3 respectively there is no issue with overlap. 
+ */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 + VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 + vpmovmskb %ymm5, %ecx - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + subq %rdx, %rdi testl %eax, %eax + jnz L(last_vec_return_x0) - jnz L(first_vec_x2_check) - subl $VEC_SIZE, %esi - jle L(max) - - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm2, %ymm0, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax - - jnz L(first_vec_x3_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN + +# ifdef USE_AS_STRNLEN .p2align 4 -L(last_2x_vec): - addl $(VEC_SIZE * 2), %esi - VPCMPEQ (%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax +L(last_4x_vec_or_less_load): + /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ + subq $-(VEC_SIZE * 4), %rdi +L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +L(last_4x_vec_or_less): - jnz L(first_vec_x0_check) - subl $VEC_SIZE, %esi - jle L(max) + vpmovmskb %ymm1, %eax + /* If remaining length > VEC_SIZE * 2. This works if esi is off by + VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 - vpmovmskb %ymm1, %eax + /* length may have been negative or positive by an offset of + VEC_SIZE * 4 depending on where this was called from. This fixes + that. */ + andl $(VEC_SIZE * 4 - 1), %esi testl %eax, %eax - jnz L(first_vec_x1_check) - movq %r8, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - VZEROUPPER_RETURN + jnz L(last_vec_x1_check) - .p2align 4 -L(first_vec_x0_check): + subl $VEC_SIZE, %esi + jb L(max) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN +# endif .p2align 4 -L(first_vec_x1_check): +L(last_vec_return_x0): tzcntl %eax, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $VEC_SIZE, %rax + subq $(VEC_SIZE * 4 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2_check): +L(last_vec_return_x1): tzcntl %eax, %eax - /* Check the end of data. 
*/ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 2), %rax + subq $(VEC_SIZE * 3 - 1), %rdi addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN +# ifdef USE_AS_STRNLEN .p2align 4 -L(first_vec_x3_check): +L(last_vec_x1_check): + tzcntl %eax, %eax /* Check the end of data. */ - cmpq %rax, %rsi - jbe L(max) - addq $(VEC_SIZE * 3), %rax + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN - .p2align 4 L(max): movq %r8, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(last_4x_vec): + /* Test first 2x VEC normally. */ + testl %eax, %eax + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. */ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + subl $(VEC_SIZE * 3), %esi + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max) + subq %rdx, %rdi + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax # ifdef USE_AS_WCSLEN shrq $2, %rax # endif VZEROUPPER_RETURN - .p2align 4 -L(zero): - xorl %eax, %eax - ret -# endif .p2align 4 -L(first_vec_x0): +L(last_vec_x1): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax + subq %rdx, %rdi + incl %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x1): +L(last_vec_x2): + /* essentially duplicates of first_vec_x1 but use 64 bit + instructions. */ tzcntl %eax, %eax - addq $VEC_SIZE, %rax + subq %rdx, %rdi + addl $(VEC_SIZE + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif VZEROUPPER_RETURN .p2align 4 -L(first_vec_x2): +L(last_vec_x3): tzcntl %eax, %eax - addq $(VEC_SIZE * 2), %rax + subl $(VEC_SIZE * 2), %esi + /* Check the end of data. */ + cmpl %eax, %esi + jb L(max_end) + subq %rdx, %rdi + addl $(VEC_SIZE * 2 + 1), %eax addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN +# ifdef USE_AS_WCSLEN shrq $2, %rax -# endif +# endif + VZEROUPPER_RETURN +L(max_end): + movq %r8, %rax VZEROUPPER_RETURN +# endif + /* Cold case for crossing page with first load. */ .p2align 4 -L(4x_vec_end): - VPCMPEQ %ymm1, %ymm0, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - VPCMPEQ %ymm2, %ymm0, %ymm2 - vpmovmskb %ymm2, %eax +L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +# ifdef USE_AS_STRNLEN testl %eax, %eax - jnz L(first_vec_x1) - VPCMPEQ %ymm3, %ymm0, %ymm3 - vpmovmskb %ymm3, %eax + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx + /* Check length. 
*/ + cmpq %rsi, %rcx + jb L(cross_page_continue) + movq %r8, %rax +# else testl %eax, %eax - jnz L(first_vec_x2) - VPCMPEQ %ymm4, %ymm0, %ymm4 - vpmovmskb %ymm4, %eax -L(first_vec_x3): + jz L(cross_page_continue) tzcntl %eax, %eax - addq $(VEC_SIZE * 3), %rax - addq %rdi, %rax - subq %rdx, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif # endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +# ifdef USE_AS_STRNLEN + .p2align 4 +L(cross_page_less_vec): + tzcntl %eax, %eax + cmpq %rax, %rsi + cmovb %esi, %eax +# ifdef USE_AS_WCSLEN + shrl $2, %eax +# endif VZEROUPPER_RETURN +# endif END (STRLEN) #endif
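One structural change visible in the strlen-avx2.S diff above is the new page-cross test: instead of aligning first and shifting out leading bytes unconditionally, the entry point masks the address with PAGE_SIZE - 1 and only takes the slow path when a full 32-byte load would run past the end of the page. A scalar model of that test (my paraphrase, for illustration only) is:

/* Illustration only: scalar model of the page-cross check done before the
   first vector load in the rewritten strlen-avx2.S.  An unaligned VEC_SIZE
   load from s cannot fault as long as it does not extend past the 4 KiB
   page containing s, i.e. as long as the page offset of s is at most
   PAGE_SIZE - VEC_SIZE.  */
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

static inline bool
first_load_crosses_page (const char *s)
{
  /* andl $(PAGE_SIZE - 1), %eax  */
  uint32_t offset = (uint32_t) (uintptr_t) s & (PAGE_SIZE - 1);
  /* cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja L(cross_page_boundary)  */
  return offset > PAGE_SIZE - VEC_SIZE;
}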
No bug. This commit optimizes strlen-avx2.S. The optimizations are mostly small things, but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> --- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- 2 files changed, 334 insertions(+), 214 deletions(-)
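As an example of the "small things" mentioned above, the strnlen path for lengths of at most VEC_SIZE (L(first_vec_x0) in the diff) folds the length clamp into the bit scan: setting bit maxlen in the match mask before the tzcnt makes a single instruction return the minimum of the first-NUL position and maxlen. A scalar model of that trick (my paraphrase, for illustration only):

/* Illustration only: scalar model of the btsq %rsi, %rax / tzcnt sequence
   in L(first_vec_x0).  match_mask has bit i set when byte i of the first
   vector is NUL; maxlen is the caller's limit in bytes and is at most
   VEC_SIZE (32) on this path, so bit maxlen always fits in 64 bits and the
   OR below can never hide a real match.  */
#include <stdint.h>

static inline unsigned int
strnlen_short_model (uint32_t match_mask, unsigned int maxlen)
{
  /* btsq %rsi, %rax: set bit maxlen so an empty mask still terminates the
     scan at maxlen.  */
  uint64_t mask = (uint64_t) match_mask | (1ULL << maxlen);
  /* tzcnt: index of the lowest set bit, i.e. min (first NUL, maxlen).  */
  return (unsigned int) __builtin_ctzll (mask);
}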