Message ID | 20210422180403.422364-1-goldstein.w.n@gmail.com
---|---
State | New
Series | [v3,1/2] x86: Optimize strchr-avx2.S
On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote: > No bug. This commit optimizes strchr-avx2.S. The optimizations are all > small things such as save an ALU in the alignment process, saving a > few instructions in the loop return, saving some bytes in the main > loop, and increasing the ILP in the return cases. test-strchr, > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- > 1 file changed, 173 insertions(+), 121 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S > index 25bec38b5d..220165d2ba 100644 > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S > @@ -49,132 +49,144 @@ > > .section SECTION(.text),"ax",@progbits > ENTRY (STRCHR) > - movl %edi, %ecx > -# ifndef USE_AS_STRCHRNUL > - xorl %edx, %edx > -# endif > - > /* Broadcast CHAR to YMM0. */ > vmovd %esi, %xmm0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + VPBROADCAST %xmm0, %ymm0 > vpxor %xmm9, %xmm9, %xmm9 > - VPBROADCAST %xmm0, %ymm0 > > /* Check if we cross page boundary with one vector load. */ > - andl $(PAGE_SIZE - 1), %ecx > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > - ja L(cross_page_boundary) > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > null byte. */ > vmovdqu (%rdi), %ymm8 > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > + vpmovmskb %ymm1, %eax This change isn't needed. > testl %eax, %eax > - jz L(more_vecs) > + jz L(aligned_more) > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > - addq %rdi, %rax > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero) > # endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > - > - .p2align 4 > -L(more_vecs): > - /* Align data for aligned loads in the loop. */ > - andq $-VEC_SIZE, %rdi > -L(aligned_more): > - > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - vmovdqa VEC_SIZE(%rdi), %ymm8 > - addq $VEC_SIZE, %rdi > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > - vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - vmovdqa VEC_SIZE(%rdi), %ymm8 > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > - vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > - > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > - vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(first_vec_x2) > - > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > - vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jz L(prep_loop_4x) > + addq %rdi, %rax > + VZEROUPPER_RETURN > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > + alignment % 32 was either 16 or 0. As well this makes the > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > + easier. 
*/ > + .p2align 5 > +L(first_vec_x4): > tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > + addq $(VEC_SIZE * 3 + 1), %rdi > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero) > # endif > + addq %rdi, %rax > VZEROUPPER_RETURN > > - .p2align 4 > -L(first_vec_x0): > - tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > - addq %rdi, %rax > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > -# endif > +L(zero): > + xorl %eax, %eax > VZEROUPPER_RETURN > +# endif > + > > .p2align 4 > L(first_vec_x1): > tzcntl %eax, %eax > - leaq VEC_SIZE(%rdi, %rax), %rax > + incq %rdi > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero) > # endif > + addq %rdi, %rax > VZEROUPPER_RETURN > > .p2align 4 > L(first_vec_x2): > tzcntl %eax, %eax > + addq $(VEC_SIZE + 1), %rdi > +# ifndef USE_AS_STRCHRNUL > /* Found CHAR or the null byte. */ > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero) > +# endif > + addq %rdi, %rax > + VZEROUPPER_RETURN > + > + .p2align 4 > +L(first_vec_x3): > + tzcntl %eax, %eax > + addq $(VEC_SIZE * 2 + 1), %rdi > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero) > # endif > + addq %rdi, %rax > VZEROUPPER_RETURN > > -L(prep_loop_4x): > - /* Align data to 4 * VEC_SIZE. */ > - andq $-(VEC_SIZE * 4), %rdi > + .p2align 4 > +L(aligned_more): > + /* Align data to VEC_SIZE - 1. This is the same number of > + instructions as using andq -VEC_SIZE but saves 4 bytes of code on > + x4 check. */ > + orq $(VEC_SIZE - 1), %rdi > +L(cross_page_continue): > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > + data is only aligned to VEC_SIZE. */ > + vmovdqa 1(%rdi), %ymm8 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > + vpor %ymm1, %ymm2, %ymm1 > + vpmovmskb %ymm1, %eax Use a space, not tab, after vpmovmskb. > + testl %eax, %eax > + jnz L(first_vec_x1) > > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > + vpor %ymm1, %ymm2, %ymm1 > + vpmovmskb %ymm1, %eax Use a space, not tab, after vpmovmskb. > + testl %eax, %eax > + jnz L(first_vec_x2) > + > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > + vpor %ymm1, %ymm2, %ymm1 > + vpmovmskb %ymm1, %eax Use a space, not tab, after vpmovmskb. > + testl %eax, %eax > + jnz L(first_vec_x3) > + > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > + vpor %ymm1, %ymm2, %ymm1 > + vpmovmskb %ymm1, %eax Use a space, not tab, after vpmovmskb. > + testl %eax, %eax > + jnz L(first_vec_x4) > + /* Align data to VEC_SIZE * 4 - 1. */ > + addq $(VEC_SIZE * 4 + 1), %rdi > + andq $-(VEC_SIZE * 4), %rdi > .p2align 4 > L(loop_4x_vec): > /* Compare 4 * VEC at a time forward. */ > - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 > - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 > - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 > - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 > + vmovdqa (%rdi), %ymm5 > + vmovdqa (VEC_SIZE)(%rdi), %ymm6 > + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 > + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > /* Leaves only CHARS matching esi as 0. 
*/ > vpxor %ymm5, %ymm0, %ymm1 > @@ -190,62 +202,102 @@ L(loop_4x_vec): > VPMINU %ymm1, %ymm2, %ymm5 > VPMINU %ymm3, %ymm4, %ymm6 > > - VPMINU %ymm5, %ymm6, %ymm5 > + VPMINU %ymm5, %ymm6, %ymm6 > > - VPCMPEQ %ymm5, %ymm9, %ymm5 > - vpmovmskb %ymm5, %eax > + VPCMPEQ %ymm6, %ymm9, %ymm6 > + vpmovmskb %ymm6, %ecx Use a space, not tab, after vpmovmskb. > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > + jz L(loop_4x_vec) > > - addq $(VEC_SIZE * 4), %rdi > - testl %eax, %eax > - jz L(loop_4x_vec) > > - VPCMPEQ %ymm1, %ymm9, %ymm1 > - vpmovmskb %ymm1, %eax > + VPCMPEQ %ymm1, %ymm9, %ymm1 > + vpmovmskb %ymm1, %eax Use a space, not tab, after vpmovmskb. > testl %eax, %eax > - jnz L(first_vec_x0) > + jnz L(last_vec_x0) > + > > - VPCMPEQ %ymm2, %ymm9, %ymm2 > - vpmovmskb %ymm2, %eax > + VPCMPEQ %ymm5, %ymm9, %ymm2 > + vpmovmskb %ymm2, %eax Use a space, not tab, after vpmovmskb. > testl %eax, %eax > - jnz L(first_vec_x1) > + jnz L(last_vec_x1) > + > + VPCMPEQ %ymm3, %ymm9, %ymm3 > + vpmovmskb %ymm3, %eax > + /* rcx has combined result from all 4 VEC. It will only be used if > + the first 3 other VEC all did not contain a match. */ > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > + subq $(VEC_SIZE * 2), %rdi > +# ifndef USE_AS_STRCHRNUL > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero_end) > +# endif > + addq %rdi, %rax > + VZEROUPPER_RETURN > + > > - VPCMPEQ %ymm3, %ymm9, %ymm3 > - VPCMPEQ %ymm4, %ymm9, %ymm4 > - vpmovmskb %ymm3, %ecx > - vpmovmskb %ymm4, %eax > - salq $32, %rax > - orq %rcx, %rax > - tzcntq %rax, %rax > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > + .p2align 4 > +L(last_vec_x0): > + tzcntl %eax, %eax > + addq $-(VEC_SIZE * 4), %rdi > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero_end) > # endif > + addq %rdi, %rax > VZEROUPPER_RETURN > > +# ifndef USE_AS_STRCHRNUL > +L(zero_end): > + xorl %eax, %eax > + VZEROUPPER_RETURN > +# endif > + > + .p2align 4 > +L(last_vec_x1): > + tzcntl %eax, %eax > + subq $(VEC_SIZE * 3), %rdi > +# ifndef USE_AS_STRCHRNUL > + /* Found CHAR or the null byte. */ > + cmp (%rdi, %rax), %CHAR_REG > + jne L(zero_end) > +# endif > + addq %rdi, %rax > + VZEROUPPER_RETURN > + > + > /* Cold case for crossing page with first load. */ > .p2align 4 > L(cross_page_boundary): > - andq $-VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - > - vmovdqa (%rdi), %ymm8 > - VPCMPEQ %ymm8, %ymm0, %ymm1 > - VPCMPEQ %ymm8, %ymm9, %ymm2 > + movq %rdi, %rdx > + /* Align rdi to VEC_SIZE - 1. */ > + orq $(VEC_SIZE - 1), %rdi > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 > + VPCMPEQ %ymm8, %ymm0, %ymm1 > + VPCMPEQ %ymm8, %ymm9, %ymm2 > vpor %ymm1, %ymm2, %ymm1 > - vpmovmskb %ymm1, %eax > - /* Remove the leading bits. */ > - sarxl %ecx, %eax, %eax > + vpmovmskb %ymm1, %eax > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > + so no need to manually mod edx. */ > + sarxl %edx, %eax, %eax > testl %eax, %eax > - jz L(aligned_more) > + jz L(cross_page_continue) > tzcntl %eax, %eax > - addq %rcx, %rdi > - addq %rdi, %rax > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + xorl %ecx, %ecx > + /* Found CHAR or the null byte. */ > + cmp (%rdx, %rax), %CHAR_REG > + leaq (%rdx, %rax), %rax > + cmovne %rcx, %rax > +# else > + addq %rdx, %rax > # endif > - VZEROUPPER_RETURN > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > > END (STRCHR) > # endif > -- > 2.29.2 > Thanks. H.J.
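For reference: the rewritten prologue computes the string's page offset once in %eax and only takes the L(cross_page_boundary) path when an unaligned 32-byte load could fault across a page. A minimal C sketch of that test, assuming 4 KiB pages and VEC_SIZE of 32 (the helper name is illustrative, not glibc's):

```c
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096   /* assumption: 4 KiB pages, as on x86_64 */
#define VEC_SIZE  32     /* AVX2 vector width in bytes */

/* Equivalent of "andl $(PAGE_SIZE - 1), %eax;
   cmpl $(PAGE_SIZE - VEC_SIZE), %eax; ja L(cross_page_boundary)":
   a single unaligned VEC_SIZE-byte load from s is safe unless the
   offset within the page leaves fewer than VEC_SIZE bytes to the
   page end.  */
static bool
load_would_cross_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}
```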
On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote: > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all > > small things such as save an ALU in the alignment process, saving a > > few instructions in the loop return, saving some bytes in the main > > loop, and increasing the ILP in the return cases. test-strchr, > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- > > 1 file changed, 173 insertions(+), 121 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S > > index 25bec38b5d..220165d2ba 100644 > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S > > @@ -49,132 +49,144 @@ > > > > .section SECTION(.text),"ax",@progbits > > ENTRY (STRCHR) > > - movl %edi, %ecx > > -# ifndef USE_AS_STRCHRNUL > > - xorl %edx, %edx > > -# endif > > - > > /* Broadcast CHAR to YMM0. */ > > vmovd %esi, %xmm0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + VPBROADCAST %xmm0, %ymm0 > > vpxor %xmm9, %xmm9, %xmm9 > > - VPBROADCAST %xmm0, %ymm0 > > > > /* Check if we cross page boundary with one vector load. */ > > - andl $(PAGE_SIZE - 1), %ecx > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > - ja L(cross_page_boundary) > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > > null byte. */ > > vmovdqu (%rdi), %ymm8 > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > + vpmovmskb %ymm1, %eax > > This change isn't needed. Fixed. For the future what is the rule for when to have a tab after the instructions vs just a space? > > > testl %eax, %eax > > - jz L(more_vecs) > > + jz L(aligned_more) > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > - addq %rdi, %rax > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero) > > # endif > > -L(return_vzeroupper): > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > - > > - .p2align 4 > > -L(more_vecs): > > - /* Align data for aligned loads in the loop. */ > > - andq $-VEC_SIZE, %rdi > > -L(aligned_more): > > - > > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. 
*/ > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > - addq $VEC_SIZE, %rdi > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > - vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > - vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > - > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > - vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x2) > > - > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > - vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jz L(prep_loop_4x) > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > > + alignment % 32 was either 16 or 0. As well this makes the > > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > > + easier. */ > > + .p2align 5 > > +L(first_vec_x4): > > tzcntl %eax, %eax > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > > + addq $(VEC_SIZE * 3 + 1), %rdi > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero) > > # endif > > + addq %rdi, %rax > > VZEROUPPER_RETURN > > > > - .p2align 4 > > -L(first_vec_x0): > > - tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > - addq %rdi, %rax > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > -# endif > > +L(zero): > > + xorl %eax, %eax > > VZEROUPPER_RETURN > > +# endif > > + > > > > .p2align 4 > > L(first_vec_x1): > > tzcntl %eax, %eax > > - leaq VEC_SIZE(%rdi, %rax), %rax > > + incq %rdi > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero) > > # endif > > + addq %rdi, %rax > > VZEROUPPER_RETURN > > > > .p2align 4 > > L(first_vec_x2): > > tzcntl %eax, %eax > > + addq $(VEC_SIZE + 1), %rdi > > +# ifndef USE_AS_STRCHRNUL > > /* Found CHAR or the null byte. */ > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero) > > +# endif > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > + > > + .p2align 4 > > +L(first_vec_x3): > > + tzcntl %eax, %eax > > + addq $(VEC_SIZE * 2 + 1), %rdi > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero) > > # endif > > + addq %rdi, %rax > > VZEROUPPER_RETURN > > > > -L(prep_loop_4x): > > - /* Align data to 4 * VEC_SIZE. */ > > - andq $-(VEC_SIZE * 4), %rdi > > + .p2align 4 > > +L(aligned_more): > > + /* Align data to VEC_SIZE - 1. This is the same number of > > + instructions as using andq -VEC_SIZE but saves 4 bytes of code on > > + x4 check. */ > > + orq $(VEC_SIZE - 1), %rdi > > +L(cross_page_continue): > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > > + data is only aligned to VEC_SIZE. */ > > + vmovdqa 1(%rdi), %ymm8 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > + vpor %ymm1, %ymm2, %ymm1 > > + vpmovmskb %ymm1, %eax > > Use a space, not tab, after vpmovmskb. Fixed. 
> > > + testl %eax, %eax > > + jnz L(first_vec_x1) > > > > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > + vpor %ymm1, %ymm2, %ymm1 > > + vpmovmskb %ymm1, %eax > > Use a space, not tab, after vpmovmskb. > Fixed. > > + testl %eax, %eax > > + jnz L(first_vec_x2) > > + > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > + vpor %ymm1, %ymm2, %ymm1 > > + vpmovmskb %ymm1, %eax > > Use a space, not tab, after vpmovmskb. > Fixed. > > + testl %eax, %eax > > + jnz L(first_vec_x3) > > + > > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > + vpor %ymm1, %ymm2, %ymm1 > > + vpmovmskb %ymm1, %eax > > Use a space, not tab, after vpmovmskb. > Fixed. > > + testl %eax, %eax > > + jnz L(first_vec_x4) > > + /* Align data to VEC_SIZE * 4 - 1. */ > > + addq $(VEC_SIZE * 4 + 1), %rdi > > + andq $-(VEC_SIZE * 4), %rdi > > .p2align 4 > > L(loop_4x_vec): > > /* Compare 4 * VEC at a time forward. */ > > - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 > > - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 > > - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 > > - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 > > + vmovdqa (%rdi), %ymm5 > > + vmovdqa (VEC_SIZE)(%rdi), %ymm6 > > + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 > > + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > > > /* Leaves only CHARS matching esi as 0. */ > > vpxor %ymm5, %ymm0, %ymm1 > > @@ -190,62 +202,102 @@ L(loop_4x_vec): > > VPMINU %ymm1, %ymm2, %ymm5 > > VPMINU %ymm3, %ymm4, %ymm6 > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > + VPMINU %ymm5, %ymm6, %ymm6 > > > > - VPCMPEQ %ymm5, %ymm9, %ymm5 > > - vpmovmskb %ymm5, %eax > > + VPCMPEQ %ymm6, %ymm9, %ymm6 > > + vpmovmskb %ymm6, %ecx > > Use a space, not tab, after vpmovmskb. > Fixed. > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > + jz L(loop_4x_vec) > > > > - addq $(VEC_SIZE * 4), %rdi > > - testl %eax, %eax > > - jz L(loop_4x_vec) > > > > - VPCMPEQ %ymm1, %ymm9, %ymm1 > > - vpmovmskb %ymm1, %eax > > + VPCMPEQ %ymm1, %ymm9, %ymm1 > > + vpmovmskb %ymm1, %eax > > Use a space, not tab, after vpmovmskb. > Fixed. > > testl %eax, %eax > > - jnz L(first_vec_x0) > > + jnz L(last_vec_x0) > > + > > > > - VPCMPEQ %ymm2, %ymm9, %ymm2 > > - vpmovmskb %ymm2, %eax > > + VPCMPEQ %ymm5, %ymm9, %ymm2 > > + vpmovmskb %ymm2, %eax > > Use a space, not tab, after vpmovmskb. > Fixed. > > testl %eax, %eax > > - jnz L(first_vec_x1) > > + jnz L(last_vec_x1) > > + > > + VPCMPEQ %ymm3, %ymm9, %ymm3 > > + vpmovmskb %ymm3, %eax > > + /* rcx has combined result from all 4 VEC. It will only be used if > > + the first 3 other VEC all did not contain a match. */ > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > + subq $(VEC_SIZE * 2), %rdi > > +# ifndef USE_AS_STRCHRNUL > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero_end) > > +# endif > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > + > > > > - VPCMPEQ %ymm3, %ymm9, %ymm3 > > - VPCMPEQ %ymm4, %ymm9, %ymm4 > > - vpmovmskb %ymm3, %ecx > > - vpmovmskb %ymm4, %eax > > - salq $32, %rax > > - orq %rcx, %rax > > - tzcntq %rax, %rax > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > + .p2align 4 > > +L(last_vec_x0): > > + tzcntl %eax, %eax > > + addq $-(VEC_SIZE * 4), %rdi > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. 
*/ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero_end) > > # endif > > + addq %rdi, %rax > > VZEROUPPER_RETURN > > > > +# ifndef USE_AS_STRCHRNUL > > +L(zero_end): > > + xorl %eax, %eax > > + VZEROUPPER_RETURN > > +# endif > > + > > + .p2align 4 > > +L(last_vec_x1): > > + tzcntl %eax, %eax > > + subq $(VEC_SIZE * 3), %rdi > > +# ifndef USE_AS_STRCHRNUL > > + /* Found CHAR or the null byte. */ > > + cmp (%rdi, %rax), %CHAR_REG > > + jne L(zero_end) > > +# endif > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > + > > + > > /* Cold case for crossing page with first load. */ > > .p2align 4 > > L(cross_page_boundary): > > - andq $-VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - > > - vmovdqa (%rdi), %ymm8 > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > + movq %rdi, %rdx > > + /* Align rdi to VEC_SIZE - 1. */ > > + orq $(VEC_SIZE - 1), %rdi > > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > vpor %ymm1, %ymm2, %ymm1 > > - vpmovmskb %ymm1, %eax > > - /* Remove the leading bits. */ > > - sarxl %ecx, %eax, %eax > > + vpmovmskb %ymm1, %eax > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > + so no need to manually mod edx. */ > > + sarxl %edx, %eax, %eax > > testl %eax, %eax > > - jz L(aligned_more) > > + jz L(cross_page_continue) > > tzcntl %eax, %eax > > - addq %rcx, %rdi > > - addq %rdi, %rax > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + xorl %ecx, %ecx > > + /* Found CHAR or the null byte. */ > > + cmp (%rdx, %rax), %CHAR_REG > > + leaq (%rdx, %rax), %rax > > + cmovne %rcx, %rax > > +# else > > + addq %rdx, %rax > > # endif > > - VZEROUPPER_RETURN > > +L(return_vzeroupper): > > + ZERO_UPPER_VEC_REGISTERS_RETURN > > > > END (STRCHR) > > # endif > > -- > > 2.29.2 > > > > Thanks. > > H.J.
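The cross-page path itself rounds %rdi down to an aligned block, builds the CHAR/NUL byte mask, and shifts it right by the original pointer; as the comment in the patch notes, sarxl masks its shift count in hardware, so the old explicit `and` goes away. A rough C-with-intrinsics sketch of the same idea, with the mask applied explicitly only to keep the C shift well defined (the helper name is made up):

```c
#include <immintrin.h>
#include <stdint.h>

#define VEC_SIZE 32

/* Sketch of L(cross_page_boundary): load the aligned 32-byte block
   containing s (reading a few bytes before s, which stays within the
   same page), then discard match bits that precede s by shifting the
   movemask result.  */
static int
partial_first_vec (const char *s, __m256i vchar)
{
  const char *block = (const char *) ((uintptr_t) s & -(uintptr_t) VEC_SIZE);
  __m256i v = _mm256_load_si256 ((const __m256i *) block);
  __m256i hit = _mm256_or_si256 (_mm256_cmpeq_epi8 (v, vchar),
                                 _mm256_cmpeq_epi8 (v, _mm256_setzero_si256 ()));
  unsigned int mask = (unsigned int) _mm256_movemask_epi8 (hit);
  mask >>= (uintptr_t) s & (VEC_SIZE - 1);   /* drop bytes before s */
  return mask ? __builtin_ctz (mask) : -1;   /* index relative to s, or -1 */
}
```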
On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote: > > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all > > > small things such as save an ALU in the alignment process, saving a > > > few instructions in the loop return, saving some bytes in the main > > > loop, and increasing the ILP in the return cases. test-strchr, > > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > --- > > > sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- > > > 1 file changed, 173 insertions(+), 121 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S > > > index 25bec38b5d..220165d2ba 100644 > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S > > > @@ -49,132 +49,144 @@ > > > > > > .section SECTION(.text),"ax",@progbits > > > ENTRY (STRCHR) > > > - movl %edi, %ecx > > > -# ifndef USE_AS_STRCHRNUL > > > - xorl %edx, %edx > > > -# endif > > > - > > > /* Broadcast CHAR to YMM0. */ > > > vmovd %esi, %xmm0 > > > + movl %edi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + VPBROADCAST %xmm0, %ymm0 > > > vpxor %xmm9, %xmm9, %xmm9 > > > - VPBROADCAST %xmm0, %ymm0 > > > > > > /* Check if we cross page boundary with one vector load. */ > > > - andl $(PAGE_SIZE - 1), %ecx > > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > > - ja L(cross_page_boundary) > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(cross_page_boundary) > > > > > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > > > null byte. */ > > > vmovdqu (%rdi), %ymm8 > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + vpmovmskb %ymm1, %eax > > > > This change isn't needed. > > Fixed. For the future what is the rule for when to have a tab after > the instructions vs just a space? I prefer if the length of <INSN> is less than a tab, <TAB><INSN><TAB><OPERANDS> Otherwise <TAB><INSN><SPACE><OPERANDS> > > > > > testl %eax, %eax > > > - jz L(more_vecs) > > > + jz L(aligned_more) > > > tzcntl %eax, %eax > > > - /* Found CHAR or the null byte. */ > > > - addq %rdi, %rax > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero) > > > # endif > > > -L(return_vzeroupper): > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > - > > > - .p2align 4 > > > -L(more_vecs): > > > - /* Align data for aligned loads in the loop. */ > > > - andq $-VEC_SIZE, %rdi > > > -L(aligned_more): > > > - > > > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > - since data is only aligned to VEC_SIZE. 
*/ > > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > > - addq $VEC_SIZE, %rdi > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > - vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x0) > > > - > > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > - vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x1) > > > - > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > - vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jnz L(first_vec_x2) > > > - > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > - vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - testl %eax, %eax > > > - jz L(prep_loop_4x) > > > + addq %rdi, %rax > > > + VZEROUPPER_RETURN > > > > > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > > > + alignment % 32 was either 16 or 0. As well this makes the > > > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > > > + easier. */ > > > + .p2align 5 > > > +L(first_vec_x4): > > > tzcntl %eax, %eax > > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > > > + addq $(VEC_SIZE * 3 + 1), %rdi > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero) > > > # endif > > > + addq %rdi, %rax > > > VZEROUPPER_RETURN > > > > > > - .p2align 4 > > > -L(first_vec_x0): > > > - tzcntl %eax, %eax > > > - /* Found CHAR or the null byte. */ > > > - addq %rdi, %rax > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > -# endif > > > +L(zero): > > > + xorl %eax, %eax > > > VZEROUPPER_RETURN > > > +# endif > > > + > > > > > > .p2align 4 > > > L(first_vec_x1): > > > tzcntl %eax, %eax > > > - leaq VEC_SIZE(%rdi, %rax), %rax > > > + incq %rdi > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero) > > > # endif > > > + addq %rdi, %rax > > > VZEROUPPER_RETURN > > > > > > .p2align 4 > > > L(first_vec_x2): > > > tzcntl %eax, %eax > > > + addq $(VEC_SIZE + 1), %rdi > > > +# ifndef USE_AS_STRCHRNUL > > > /* Found CHAR or the null byte. */ > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero) > > > +# endif > > > + addq %rdi, %rax > > > + VZEROUPPER_RETURN > > > + > > > + .p2align 4 > > > +L(first_vec_x3): > > > + tzcntl %eax, %eax > > > + addq $(VEC_SIZE * 2 + 1), %rdi > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero) > > > # endif > > > + addq %rdi, %rax > > > VZEROUPPER_RETURN > > > > > > -L(prep_loop_4x): > > > - /* Align data to 4 * VEC_SIZE. */ > > > - andq $-(VEC_SIZE * 4), %rdi > > > + .p2align 4 > > > +L(aligned_more): > > > + /* Align data to VEC_SIZE - 1. This is the same number of > > > + instructions as using andq -VEC_SIZE but saves 4 bytes of code on > > > + x4 check. */ > > > + orq $(VEC_SIZE - 1), %rdi > > > +L(cross_page_continue): > > > + /* Check the next 4 * VEC_SIZE. 
Only one VEC_SIZE at a time since > > > + data is only aligned to VEC_SIZE. */ > > > + vmovdqa 1(%rdi), %ymm8 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + vpor %ymm1, %ymm2, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > > Use a space, not tab, after vpmovmskb. > > Fixed. > > > > > > + testl %eax, %eax > > > + jnz L(first_vec_x1) > > > > > > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + vpor %ymm1, %ymm2, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > + testl %eax, %eax > > > + jnz L(first_vec_x2) > > > + > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + vpor %ymm1, %ymm2, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > + testl %eax, %eax > > > + jnz L(first_vec_x3) > > > + > > > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + vpor %ymm1, %ymm2, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > + testl %eax, %eax > > > + jnz L(first_vec_x4) > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > + addq $(VEC_SIZE * 4 + 1), %rdi > > > + andq $-(VEC_SIZE * 4), %rdi > > > .p2align 4 > > > L(loop_4x_vec): > > > /* Compare 4 * VEC at a time forward. */ > > > - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 > > > - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 > > > - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 > > > - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 > > > + vmovdqa (%rdi), %ymm5 > > > + vmovdqa (VEC_SIZE)(%rdi), %ymm6 > > > + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 > > > + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > > > > > /* Leaves only CHARS matching esi as 0. */ > > > vpxor %ymm5, %ymm0, %ymm1 > > > @@ -190,62 +202,102 @@ L(loop_4x_vec): > > > VPMINU %ymm1, %ymm2, %ymm5 > > > VPMINU %ymm3, %ymm4, %ymm6 > > > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > + VPMINU %ymm5, %ymm6, %ymm6 > > > > > > - VPCMPEQ %ymm5, %ymm9, %ymm5 > > > - vpmovmskb %ymm5, %eax > > > + VPCMPEQ %ymm6, %ymm9, %ymm6 > > > + vpmovmskb %ymm6, %ecx > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > + subq $-(VEC_SIZE * 4), %rdi > > > + testl %ecx, %ecx > > > + jz L(loop_4x_vec) > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > - testl %eax, %eax > > > - jz L(loop_4x_vec) > > > > > > - VPCMPEQ %ymm1, %ymm9, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > + VPCMPEQ %ymm1, %ymm9, %ymm1 > > > + vpmovmskb %ymm1, %eax > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > testl %eax, %eax > > > - jnz L(first_vec_x0) > > > + jnz L(last_vec_x0) > > > + > > > > > > - VPCMPEQ %ymm2, %ymm9, %ymm2 > > > - vpmovmskb %ymm2, %eax > > > + VPCMPEQ %ymm5, %ymm9, %ymm2 > > > + vpmovmskb %ymm2, %eax > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > testl %eax, %eax > > > - jnz L(first_vec_x1) > > > + jnz L(last_vec_x1) > > > + > > > + VPCMPEQ %ymm3, %ymm9, %ymm3 > > > + vpmovmskb %ymm3, %eax > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > + the first 3 other VEC all did not contain a match. */ > > > + salq $32, %rcx > > > + orq %rcx, %rax > > > + tzcntq %rax, %rax > > > + subq $(VEC_SIZE * 2), %rdi > > > +# ifndef USE_AS_STRCHRNUL > > > + /* Found CHAR or the null byte. 
*/ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero_end) > > > +# endif > > > + addq %rdi, %rax > > > + VZEROUPPER_RETURN > > > + > > > > > > - VPCMPEQ %ymm3, %ymm9, %ymm3 > > > - VPCMPEQ %ymm4, %ymm9, %ymm4 > > > - vpmovmskb %ymm3, %ecx > > > - vpmovmskb %ymm4, %eax > > > - salq $32, %rax > > > - orq %rcx, %rax > > > - tzcntq %rax, %rax > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > > + .p2align 4 > > > +L(last_vec_x0): > > > + tzcntl %eax, %eax > > > + addq $-(VEC_SIZE * 4), %rdi > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero_end) > > > # endif > > > + addq %rdi, %rax > > > VZEROUPPER_RETURN > > > > > > +# ifndef USE_AS_STRCHRNUL > > > +L(zero_end): > > > + xorl %eax, %eax > > > + VZEROUPPER_RETURN > > > +# endif > > > + > > > + .p2align 4 > > > +L(last_vec_x1): > > > + tzcntl %eax, %eax > > > + subq $(VEC_SIZE * 3), %rdi > > > +# ifndef USE_AS_STRCHRNUL > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdi, %rax), %CHAR_REG > > > + jne L(zero_end) > > > +# endif > > > + addq %rdi, %rax > > > + VZEROUPPER_RETURN > > > + > > > + > > > /* Cold case for crossing page with first load. */ > > > .p2align 4 > > > L(cross_page_boundary): > > > - andq $-VEC_SIZE, %rdi > > > - andl $(VEC_SIZE - 1), %ecx > > > - > > > - vmovdqa (%rdi), %ymm8 > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > + movq %rdi, %rdx > > > + /* Align rdi to VEC_SIZE - 1. */ > > > + orq $(VEC_SIZE - 1), %rdi > > > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > vpor %ymm1, %ymm2, %ymm1 > > > - vpmovmskb %ymm1, %eax > > > - /* Remove the leading bits. */ > > > - sarxl %ecx, %eax, %eax > > > + vpmovmskb %ymm1, %eax > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > + so no need to manually mod edx. */ > > > + sarxl %edx, %eax, %eax > > > testl %eax, %eax > > > - jz L(aligned_more) > > > + jz L(cross_page_continue) > > > tzcntl %eax, %eax > > > - addq %rcx, %rdi > > > - addq %rdi, %rax > > > # ifndef USE_AS_STRCHRNUL > > > - cmp (%rax), %CHAR_REG > > > - cmovne %rdx, %rax > > > + xorl %ecx, %ecx > > > + /* Found CHAR or the null byte. */ > > > + cmp (%rdx, %rax), %CHAR_REG > > > + leaq (%rdx, %rax), %rax > > > + cmovne %rcx, %rax > > > +# else > > > + addq %rdx, %rax > > > # endif > > > - VZEROUPPER_RETURN > > > +L(return_vzeroupper): > > > + ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > > END (STRCHR) > > > # endif > > > -- > > > 2.29.2 > > > > > > > Thanks. > > > > H.J.
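One of the byte savings under review comes from aligning with `orq $(VEC_SIZE - 1), %rdi` and then addressing the vectors at displacements 1, VEC_SIZE + 1, and so on, rather than using `andq $-VEC_SIZE`. A small self-contained C check of why the two forms describe the same aligned blocks (the constants are chosen only for the demonstration):

```c
#include <assert.h>
#include <stdint.h>

#define VEC_SIZE 32

int
main (void)
{
  for (uintptr_t p = 0x1000; p < 0x1000 + 4 * VEC_SIZE; p++)
    {
      /* "orq $(VEC_SIZE - 1), %rdi" points at the last byte of the
         VEC_SIZE-aligned block containing p ...  */
      uintptr_t or_align = p | (VEC_SIZE - 1);
      /* ... which equals rounding down and adding VEC_SIZE - 1 ...  */
      assert (or_align == (p & ~(uintptr_t) (VEC_SIZE - 1)) + VEC_SIZE - 1);
      /* ... so loads at displacement 1, VEC_SIZE + 1, ... in the patch
         all land on VEC_SIZE-aligned addresses.  */
      assert ((or_align + 1) % VEC_SIZE == 0);
    }
  return 0;
}
```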
On Fri, Apr 23, 2021 at 3:11 PM H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> wrote: > > On Fri, Apr 23, 2021 at 12:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Fri, Apr 23, 2021 at 12:56 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Thu, Apr 22, 2021 at 02:04:02PM -0400, Noah Goldstein wrote: > > > > No bug. This commit optimizes strchr-avx2.S. The optimizations are all > > > > small things such as save an ALU in the alignment process, saving a > > > > few instructions in the loop return, saving some bytes in the main > > > > loop, and increasing the ILP in the return cases. test-strchr, > > > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > > > > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > > > --- > > > > sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++---------- > > > > 1 file changed, 173 insertions(+), 121 deletions(-) > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S > > > > index 25bec38b5d..220165d2ba 100644 > > > > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S > > > > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S > > > > @@ -49,132 +49,144 @@ > > > > > > > > .section SECTION(.text),"ax",@progbits > > > > ENTRY (STRCHR) > > > > - movl %edi, %ecx > > > > -# ifndef USE_AS_STRCHRNUL > > > > - xorl %edx, %edx > > > > -# endif > > > > - > > > > /* Broadcast CHAR to YMM0. */ > > > > vmovd %esi, %xmm0 > > > > + movl %edi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + VPBROADCAST %xmm0, %ymm0 > > > > vpxor %xmm9, %xmm9, %xmm9 > > > > - VPBROADCAST %xmm0, %ymm0 > > > > > > > > /* Check if we cross page boundary with one vector load. */ > > > > - andl $(PAGE_SIZE - 1), %ecx > > > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > > > - ja L(cross_page_boundary) > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(cross_page_boundary) > > > > > > > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > > > > null byte. */ > > > > vmovdqu (%rdi), %ymm8 > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + vpmovmskb %ymm1, %eax > > > > > > This change isn't needed. > > > > Fixed. For the future what is the rule for when to have a tab after > > the instructions vs just a space? > > I prefer if the length of <INSN> is less than a tab, > > <TAB><INSN><TAB><OPERANDS> > > Otherwise > > <TAB><INSN><SPACE><OPERANDS> > > > > > > > > testl %eax, %eax > > > > - jz L(more_vecs) > > > > + jz L(aligned_more) > > > > tzcntl %eax, %eax > > > > - /* Found CHAR or the null byte. */ > > > > - addq %rdi, %rax > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero) > > > > # endif > > > > -L(return_vzeroupper): > > > > - ZERO_UPPER_VEC_REGISTERS_RETURN > > > > - > > > > - .p2align 4 > > > > -L(more_vecs): > > > > - /* Align data for aligned loads in the loop. */ > > > > - andq $-VEC_SIZE, %rdi > > > > -L(aligned_more): > > > > - > > > > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > > > > - since data is only aligned to VEC_SIZE. 
*/ > > > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > > > - addq $VEC_SIZE, %rdi > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > - vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x0) > > > > - > > > > - vmovdqa VEC_SIZE(%rdi), %ymm8 > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > - vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x1) > > > > - > > > > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > - vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jnz L(first_vec_x2) > > > > - > > > > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > - vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - testl %eax, %eax > > > > - jz L(prep_loop_4x) > > > > + addq %rdi, %rax > > > > + VZEROUPPER_RETURN > > > > > > > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > > > > + alignment % 32 was either 16 or 0. As well this makes the > > > > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > > > > + easier. */ > > > > + .p2align 5 > > > > +L(first_vec_x4): > > > > tzcntl %eax, %eax > > > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > > > > + addq $(VEC_SIZE * 3 + 1), %rdi > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero) > > > > # endif > > > > + addq %rdi, %rax > > > > VZEROUPPER_RETURN > > > > > > > > - .p2align 4 > > > > -L(first_vec_x0): > > > > - tzcntl %eax, %eax > > > > - /* Found CHAR or the null byte. */ > > > > - addq %rdi, %rax > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > -# endif > > > > +L(zero): > > > > + xorl %eax, %eax > > > > VZEROUPPER_RETURN > > > > +# endif > > > > + > > > > > > > > .p2align 4 > > > > L(first_vec_x1): > > > > tzcntl %eax, %eax > > > > - leaq VEC_SIZE(%rdi, %rax), %rax > > > > + incq %rdi > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero) > > > > # endif > > > > + addq %rdi, %rax > > > > VZEROUPPER_RETURN > > > > > > > > .p2align 4 > > > > L(first_vec_x2): > > > > tzcntl %eax, %eax > > > > + addq $(VEC_SIZE + 1), %rdi > > > > +# ifndef USE_AS_STRCHRNUL > > > > /* Found CHAR or the null byte. */ > > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero) > > > > +# endif > > > > + addq %rdi, %rax > > > > + VZEROUPPER_RETURN > > > > + > > > > + .p2align 4 > > > > +L(first_vec_x3): > > > > + tzcntl %eax, %eax > > > > + addq $(VEC_SIZE * 2 + 1), %rdi > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero) > > > > # endif > > > > + addq %rdi, %rax > > > > VZEROUPPER_RETURN > > > > > > > > -L(prep_loop_4x): > > > > - /* Align data to 4 * VEC_SIZE. */ > > > > - andq $-(VEC_SIZE * 4), %rdi > > > > + .p2align 4 > > > > +L(aligned_more): > > > > + /* Align data to VEC_SIZE - 1. 
This is the same number of > > > > + instructions as using andq -VEC_SIZE but saves 4 bytes of code on > > > > + x4 check. */ > > > > + orq $(VEC_SIZE - 1), %rdi > > > > +L(cross_page_continue): > > > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > > > > + data is only aligned to VEC_SIZE. */ > > > > + vmovdqa 1(%rdi), %ymm8 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + vpor %ymm1, %ymm2, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > Fixed. > > > > > > > > > + testl %eax, %eax > > > > + jnz L(first_vec_x1) > > > > > > > > + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + vpor %ymm1, %ymm2, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. > > > > > > + testl %eax, %eax > > > > + jnz L(first_vec_x2) > > > > + > > > > + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + vpor %ymm1, %ymm2, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. > > > > > > + testl %eax, %eax > > > > + jnz L(first_vec_x3) > > > > + > > > > + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + vpor %ymm1, %ymm2, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. > > > > > > + testl %eax, %eax > > > > + jnz L(first_vec_x4) > > > > + /* Align data to VEC_SIZE * 4 - 1. */ > > > > + addq $(VEC_SIZE * 4 + 1), %rdi > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > .p2align 4 > > > > L(loop_4x_vec): > > > > /* Compare 4 * VEC at a time forward. */ > > > > - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 > > > > - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 > > > > - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 > > > > - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 > > > > + vmovdqa (%rdi), %ymm5 > > > > + vmovdqa (VEC_SIZE)(%rdi), %ymm6 > > > > + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 > > > > + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 > > > > > > > > /* Leaves only CHARS matching esi as 0. */ > > > > vpxor %ymm5, %ymm0, %ymm1 > > > > @@ -190,62 +202,102 @@ L(loop_4x_vec): > > > > VPMINU %ymm1, %ymm2, %ymm5 > > > > VPMINU %ymm3, %ymm4, %ymm6 > > > > > > > > - VPMINU %ymm5, %ymm6, %ymm5 > > > > + VPMINU %ymm5, %ymm6, %ymm6 > > > > > > > > - VPCMPEQ %ymm5, %ymm9, %ymm5 > > > > - vpmovmskb %ymm5, %eax > > > > + VPCMPEQ %ymm6, %ymm9, %ymm6 > > > > + vpmovmskb %ymm6, %ecx > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. > > > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + testl %ecx, %ecx > > > > + jz L(loop_4x_vec) > > > > > > > > - addq $(VEC_SIZE * 4), %rdi > > > > - testl %eax, %eax > > > > - jz L(loop_4x_vec) > > > > > > > > - VPCMPEQ %ymm1, %ymm9, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > + VPCMPEQ %ymm1, %ymm9, %ymm1 > > > > + vpmovmskb %ymm1, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. > > > > > > testl %eax, %eax > > > > - jnz L(first_vec_x0) > > > > + jnz L(last_vec_x0) > > > > + > > > > > > > > - VPCMPEQ %ymm2, %ymm9, %ymm2 > > > > - vpmovmskb %ymm2, %eax > > > > + VPCMPEQ %ymm5, %ymm9, %ymm2 > > > > + vpmovmskb %ymm2, %eax > > > > > > Use a space, not tab, after vpmovmskb. > > > > > > > Fixed. 
> > > > > > testl %eax, %eax > > > > - jnz L(first_vec_x1) > > > > + jnz L(last_vec_x1) > > > > + > > > > + VPCMPEQ %ymm3, %ymm9, %ymm3 > > > > + vpmovmskb %ymm3, %eax > > > > + /* rcx has combined result from all 4 VEC. It will only be used if > > > > + the first 3 other VEC all did not contain a match. */ > > > > + salq $32, %rcx > > > > + orq %rcx, %rax > > > > + tzcntq %rax, %rax > > > > + subq $(VEC_SIZE * 2), %rdi > > > > +# ifndef USE_AS_STRCHRNUL > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero_end) > > > > +# endif > > > > + addq %rdi, %rax > > > > + VZEROUPPER_RETURN > > > > + > > > > > > > > - VPCMPEQ %ymm3, %ymm9, %ymm3 > > > > - VPCMPEQ %ymm4, %ymm9, %ymm4 > > > > - vpmovmskb %ymm3, %ecx > > > > - vpmovmskb %ymm4, %eax > > > > - salq $32, %rax > > > > - orq %rcx, %rax > > > > - tzcntq %rax, %rax > > > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > > > + .p2align 4 > > > > +L(last_vec_x0): > > > > + tzcntl %eax, %eax > > > > + addq $-(VEC_SIZE * 4), %rdi > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero_end) > > > > # endif > > > > + addq %rdi, %rax > > > > VZEROUPPER_RETURN > > > > > > > > +# ifndef USE_AS_STRCHRNUL > > > > +L(zero_end): > > > > + xorl %eax, %eax > > > > + VZEROUPPER_RETURN > > > > +# endif > > > > + > > > > + .p2align 4 > > > > +L(last_vec_x1): > > > > + tzcntl %eax, %eax > > > > + subq $(VEC_SIZE * 3), %rdi > > > > +# ifndef USE_AS_STRCHRNUL > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdi, %rax), %CHAR_REG > > > > + jne L(zero_end) > > > > +# endif > > > > + addq %rdi, %rax > > > > + VZEROUPPER_RETURN > > > > + > > > > + > > > > /* Cold case for crossing page with first load. */ > > > > .p2align 4 > > > > L(cross_page_boundary): > > > > - andq $-VEC_SIZE, %rdi > > > > - andl $(VEC_SIZE - 1), %ecx > > > > - > > > > - vmovdqa (%rdi), %ymm8 > > > > - VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > - VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > + movq %rdi, %rdx > > > > + /* Align rdi to VEC_SIZE - 1. */ > > > > + orq $(VEC_SIZE - 1), %rdi > > > > + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 > > > > + VPCMPEQ %ymm8, %ymm0, %ymm1 > > > > + VPCMPEQ %ymm8, %ymm9, %ymm2 > > > > vpor %ymm1, %ymm2, %ymm1 > > > > - vpmovmskb %ymm1, %eax > > > > - /* Remove the leading bits. */ > > > > - sarxl %ecx, %eax, %eax > > > > + vpmovmskb %ymm1, %eax > > > > + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT > > > > + so no need to manually mod edx. */ > > > > + sarxl %edx, %eax, %eax > > > > testl %eax, %eax > > > > - jz L(aligned_more) > > > > + jz L(cross_page_continue) > > > > tzcntl %eax, %eax > > > > - addq %rcx, %rdi > > > > - addq %rdi, %rax > > > > # ifndef USE_AS_STRCHRNUL > > > > - cmp (%rax), %CHAR_REG > > > > - cmovne %rdx, %rax > > > > + xorl %ecx, %ecx > > > > + /* Found CHAR or the null byte. */ > > > > + cmp (%rdx, %rax), %CHAR_REG > > > > + leaq (%rdx, %rax), %rax > > > > + cmovne %rcx, %rax > > > > +# else > > > > + addq %rdx, %rax > > > > # endif > > > > - VZEROUPPER_RETURN > > > > +L(return_vzeroupper): > > > > + ZERO_UPPER_VEC_REGISTERS_RETURN > > > > > > > > END (STRCHR) > > > > # endif > > > > -- > > > > 2.29.2 > > > > > > > > > > Thanks. > > > > > > H.J. > > > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 25bec38b5d..220165d2ba 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -49,132 +49,144 @@ .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) - movl %edi, %ecx -# ifndef USE_AS_STRCHRNUL - xorl %edx, %edx -# endif - /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 vpxor %xmm9, %xmm9, %xmm9 - VPBROADCAST %xmm0, %ymm0 /* Check if we cross page boundary with one vector load. */ - andl $(PAGE_SIZE - 1), %ecx - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx - ja L(cross_page_boundary) + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Search for both CHAR and the null byte. */ vmovdqu (%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax + vpmovmskb %ymm1, %eax testl %eax, %eax - jz L(more_vecs) + jz L(aligned_more) tzcntl %eax, %eax - /* Found CHAR or the null byte. */ - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - - .p2align 4 -L(more_vecs): - /* Align data for aligned loads in the loop. */ - andq $-VEC_SIZE, %rdi -L(aligned_more): - - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - vmovdqa VEC_SIZE(%rdi), %ymm8 - addq $VEC_SIZE, %rdi - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - vmovdqa VEC_SIZE(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x1) - - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jnz L(first_vec_x2) - - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 - vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - testl %eax, %eax - jz L(prep_loop_4x) + addq %rdi, %rax + VZEROUPPER_RETURN + /* .p2align 5 helps keep performance more consistent if ENTRY() + alignment % 32 was either 16 or 0. As well this makes the + alignment % 32 of the loop_4x_vec fixed which makes tuning it + easier. */ + .p2align 5 +L(first_vec_x4): tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + addq $(VEC_SIZE * 3 + 1), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN - .p2align 4 -L(first_vec_x0): - tzcntl %eax, %eax - /* Found CHAR or the null byte. */ - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax -# endif +L(zero): + xorl %eax, %eax VZEROUPPER_RETURN +# endif + .p2align 4 L(first_vec_x1): tzcntl %eax, %eax - leaq VEC_SIZE(%rdi, %rax), %rax + incq %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): tzcntl %eax, %eax + addq $(VEC_SIZE + 1), %rdi +# ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero) # endif + addq %rdi, %rax VZEROUPPER_RETURN -L(prep_loop_4x): - /* Align data to 4 * VEC_SIZE. */ - andq $-(VEC_SIZE * 4), %rdi + .p2align 4 +L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of + instructions as using andq -VEC_SIZE but saves 4 bytes of code on + x4 check. */ + orq $(VEC_SIZE - 1), %rdi +L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since + data is only aligned to VEC_SIZE. */ + vmovdqa 1(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + /* Align data to VEC_SIZE * 4 - 1. */ + addq $(VEC_SIZE * 4 + 1), %rdi + andq $-(VEC_SIZE * 4), %rdi .p2align 4 L(loop_4x_vec): /* Compare 4 * VEC at a time forward. */ - vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 - vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 - vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 - vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 + vmovdqa (%rdi), %ymm5 + vmovdqa (VEC_SIZE)(%rdi), %ymm6 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 /* Leaves only CHARS matching esi as 0. */ vpxor %ymm5, %ymm0, %ymm1 @@ -190,62 +202,102 @@ L(loop_4x_vec): VPMINU %ymm1, %ymm2, %ymm5 VPMINU %ymm3, %ymm4, %ymm6 - VPMINU %ymm5, %ymm6, %ymm5 + VPMINU %ymm5, %ymm6, %ymm6 - VPCMPEQ %ymm5, %ymm9, %ymm5 - vpmovmskb %ymm5, %eax + VPCMPEQ %ymm6, %ymm9, %ymm6 + vpmovmskb %ymm6, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) - addq $(VEC_SIZE * 4), %rdi - testl %eax, %eax - jz L(loop_4x_vec) - VPCMPEQ %ymm1, %ymm9, %ymm1 - vpmovmskb %ymm1, %eax + VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x0) + - VPCMPEQ %ymm2, %ymm9, %ymm2 - vpmovmskb %ymm2, %eax + VPCMPEQ %ymm5, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax testl %eax, %eax - jnz L(first_vec_x1) + jnz L(last_vec_x1) + + VPCMPEQ %ymm3, %ymm9, %ymm3 + vpmovmskb %ymm3, %eax + /* rcx has combined result from all 4 VEC. It will only be used if + the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2), %rdi +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + - VPCMPEQ %ymm3, %ymm9, %ymm3 - VPCMPEQ %ymm4, %ymm9, %ymm4 - vpmovmskb %ymm3, %ecx - vpmovmskb %ymm4, %eax - salq $32, %rax - orq %rcx, %rax - tzcntq %rax, %rax - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + .p2align 4 +L(last_vec_x0): + tzcntl %eax, %eax + addq $-(VEC_SIZE * 4), %rdi # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) # endif + addq %rdi, %rax VZEROUPPER_RETURN +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +# endif + + .p2align 4 +L(last_vec_x1): + tzcntl %eax, %eax + subq $(VEC_SIZE * 3), %rdi +# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG + jne L(zero_end) +# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + /* Cold case for crossing page with first load. */ .p2align 4 L(cross_page_boundary): - andq $-VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - - vmovdqa (%rdi), %ymm8 - VPCMPEQ %ymm8, %ymm0, %ymm1 - VPCMPEQ %ymm8, %ymm9, %ymm2 + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 vpor %ymm1, %ymm2, %ymm1 - vpmovmskb %ymm1, %eax - /* Remove the leading bits. */ - sarxl %ecx, %eax, %eax + vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax testl %eax, %eax - jz L(aligned_more) + jz L(cross_page_continue) tzcntl %eax, %eax - addq %rcx, %rdi - addq %rdi, %rax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + xorl %ecx, %ecx + /* Found CHAR or the null byte. */ + cmp (%rdx, %rax), %CHAR_REG + leaq (%rdx, %rax), %rax + cmovne %rcx, %rax +# else + addq %rdx, %rax # endif - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN END (STRCHR) # endif
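A detail of the new loop-return path in the diff above: once the first two vectors are known to be clear, the mask of the third vector (in %rax) and the combined mask of all four (in %rcx) are merged into one 64-bit value, so a single tzcntq finds the first CHAR or NUL in either of the remaining vectors. A scalar sketch of that merge (the function name is illustrative):

```c
#include <stdint.h>

/* Sketch of "salq $32, %rcx; orq %rcx, %rax; tzcntq %rax, %rax".
   combined_mask (the movemask of the min over all four vectors) only
   matters when vec3_mask is zero, in which case its set bits can only
   come from the fourth vector; the result is the byte offset of the
   first hit relative to the third vector's base address.  */
static unsigned int
first_hit_offset (uint32_t vec3_mask, uint32_t combined_mask)
{
  uint64_t both = (uint64_t) combined_mask << 32 | vec3_mask;
  return (unsigned int) __builtin_ctzll (both);  /* caller ensures both != 0 */
}
```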
No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
 1 file changed, 173 insertions(+), 121 deletions(-)
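For context on the main loop being tuned here: the vpxor/VPMINU sequence (part of it sits in elided hunk context) leans on the identity that the unsigned minimum of x ^ c and x is zero exactly when x equals the searched character or x is the NUL byte, so one compare against zero covers both terminations. A small plain-C check of that identity (names invented for the sketch):

```c
#include <assert.h>
#include <stdint.h>

/* min(x ^ c, x) == 0  <=>  x == c || x == 0, verified over all byte values.  */
static int
hits (uint8_t x, uint8_t c)
{
  uint8_t xc = (uint8_t) (x ^ c);
  uint8_t m = xc < x ? xc : x;   /* unsigned per-byte minimum */
  return m == 0;
}

int
main (void)
{
  for (unsigned int x = 0; x < 256; x++)
    for (unsigned int c = 0; c < 256; c++)
      assert (hits ((uint8_t) x, (uint8_t) c) == (x == c || x == 0));
  return 0;
}
```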