Message ID: 20210421213951.404588-2-goldstein.w.n@gmail.com
State: New
Series: [v1,1/2] x86: Optimize strlen-avx2.S
On Wed, Apr 21, 2021 at 2:40 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > No bug. This commit optimizes strlen-evex.S. The optimizations are > mostly small things such as save an ALU in the alignment process, > saving a few instructions in the loop return. The one significant > change is saving 2 instructions in the 4x loop. test-strchr, > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++----------- > 1 file changed, 214 insertions(+), 174 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > index ddc86a7058..7cd111e96c 100644 > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > @@ -24,23 +24,26 @@ > # define STRCHR __strchr_evex > # endif > > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > +# define VMOVU vmovdqu64 > +# define VMOVA vmovdqa64 > > # ifdef USE_AS_WCSCHR > # define VPBROADCAST vpbroadcastd > # define VPCMP vpcmpd > # define VPMINU vpminud > # define CHAR_REG esi > -# define SHIFT_REG r8d > +# define SHIFT_REG ecx > +# define CHAR_SIZE 4 > # else > # define VPBROADCAST vpbroadcastb > # define VPCMP vpcmpb > # define VPMINU vpminub > # define CHAR_REG sil > -# define SHIFT_REG ecx > +# define SHIFT_REG edx > +# define CHAR_SIZE 1 > # endif > > + > # define XMMZERO xmm16 > > # define YMMZERO ymm16 > @@ -56,23 +59,20 @@ > > # define VEC_SIZE 32 > # define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section .text.evex,"ax",@progbits > ENTRY (STRCHR) > - movl %edi, %ecx > -# ifndef USE_AS_STRCHRNUL > - xorl %edx, %edx > -# endif > - > /* Broadcast CHAR to YMM0. */ > - VPBROADCAST %esi, %YMM0 > - > + VPBROADCAST %esi, %YMM0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > - /* Check if we cross page boundary with one vector load. */ > - andl $(PAGE_SIZE - 1), %ecx > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > - ja L(cross_page_boundary) > + /* Check if we cross page boundary with one vector load. Otherwise > + it is safe to use an unaligned load. */ > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > null bytes. */ > @@ -83,251 +83,291 @@ ENTRY (STRCHR) > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > VPCMP $0, %YMMZERO, %YMM2, %k0 > - ktestd %k0, %k0 > - jz L(more_vecs) > kmovd %k0, %eax > + testl %eax, %eax > + jz L(aligned_more) > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + leaq (%rdi, %rax, CHAR_SIZE), %rax > # else > addq %rdi, %rax > # endif > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rax), %CHAR_REG > + jne L(zero) > # endif > ret > > - .p2align 4 > -L(more_vecs): > - /* Align data for aligned loads in the loop. */ > - andq $-VEC_SIZE, %rdi > -L(aligned_more): > - > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - VMOVA VEC_SIZE(%rdi), %YMM1 > - addq $VEC_SIZE, %rdi > - > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VMOVA VEC_SIZE(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > - > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x2) > - > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - ktestd %k0, %k0 > - jz L(prep_loop_4x) > - > - kmovd %k0, %eax > + /* .p2align 5 helps keep performance more consistent if ENTRY() > + alignment % 32 was either 16 or 0. As well this makes the > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > + easier. */ > + .p2align 5 > +L(first_vec_x3): > tzcntl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax > -# else > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > -# endif > +L(zero): > + xorl %eax, %eax > ret > +# endif > > .p2align 4 > -L(first_vec_x0): > +L(first_vec_x4): > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if first match was CHAR (k0) or null (k1). */ > + kmovd %k0, %eax > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + kmovd %k1, %ecx > + /* bzhil will not be 0 if first match was null. */ > + bzhil %eax, %ecx, %ecx > + jne L(zero) > # else > - addq %rdi, %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Combine CHAR and null matches. */ > + kord %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > ret > > .p2align 4 > L(first_vec_x1): > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq VEC_SIZE(%rdi, %rax, 4), %rax > -# else > - leaq VEC_SIZE(%rdi, %rax), %rax > -# endif > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero) > + > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > ret > > .p2align 4 > L(first_vec_x2): > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if first match was CHAR (k0) or null (k1). 
*/ > + kmovd %k0, %eax > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > + kmovd %k1, %ecx > + /* bzhil will not be 0 if first match was null. */ > + bzhil %eax, %ecx, %ecx > + jne L(zero) > # else > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Combine CHAR and null matches. */ > + kord %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > -L(prep_loop_4x): > - /* Align data to 4 * VEC_SIZE. */ > + .p2align 4 > +L(aligned_more): > + /* Align data to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > +L(cross_page_continue): > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > + data is only aligned to VEC_SIZE. Use two alternating methods for > + checking VEC to balance latency and port contention. */ > + > + /* This method has higher latency but has better port > + distribution. */ > + VMOVA (VEC_SIZE)(%rdi), %YMM1 > + /* Leaves only CHARS matching esi as 0. */ > + vpxorq %YMM1, %YMM0, %YMM2 > + VPMINU %YMM2, %YMM1, %YMM2 > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x1) > + > + /* This method has higher latency but has better port > + distribution. */ > + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > + /* Each bit in K0 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMM0, %k0 > + /* Each bit in K1 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMMZERO, %k1 > + kortestd %k0, %k1 > + jnz L(first_vec_x2) > + > + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > + /* Leaves only CHARS matching esi as 0. */ > + vpxorq %YMM1, %YMM0, %YMM2 > + VPMINU %YMM2, %YMM1, %YMM2 > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x3) > + > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > + /* Each bit in K0 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMM0, %k0 > + /* Each bit in K1 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMMZERO, %k1 > + kortestd %k0, %k1 > + jnz L(first_vec_x4) > + > + /* Align data to VEC_SIZE * 4 for the loop. */ > + addq $VEC_SIZE, %rdi > andq $-(VEC_SIZE * 4), %rdi > > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > + /* Check 4x VEC at a time. No penalty to imm32 offset with evex > + encoding. */ > VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 > VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 > > - /* Leaves only CHARS matching esi as 0. */ > + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ > vpxorq %YMM1, %YMM0, %YMM5 > - vpxorq %YMM2, %YMM0, %YMM6 > + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k > + register. Its possible to save either 1 or 2 instructions using cmp no > + equals method for either YMM1 or YMM1 and YMM3 respectively but > + bottleneck on p5 makes it no worth it. 
*/ > + VPCMP $4, %YMM0, %YMM2, %k2 > vpxorq %YMM3, %YMM0, %YMM7 > - vpxorq %YMM4, %YMM0, %YMM8 > - > - VPMINU %YMM5, %YMM1, %YMM5 > - VPMINU %YMM6, %YMM2, %YMM6 > - VPMINU %YMM7, %YMM3, %YMM7 > - VPMINU %YMM8, %YMM4, %YMM8 > - > - VPMINU %YMM5, %YMM6, %YMM1 > - VPMINU %YMM7, %YMM8, %YMM2 > - > - VPMINU %YMM1, %YMM2, %YMM1 > - > - /* Each bit in K0 represents a CHAR or a null byte. */ > - VPCMP $0, %YMMZERO, %YMM1, %k0 > - > - addq $(VEC_SIZE * 4), %rdi > - > - ktestd %k0, %k0 > + VPCMP $4, %YMM0, %YMM4, %k4 > + > + /* Use min to select all zeros (either from xor or end of string). */ > + VPMINU %YMM1, %YMM5, %YMM1 > + VPMINU %YMM3, %YMM7, %YMM3 > + > + /* Use min + zeromask to select for zeros. Since k2 and k4 will be > + have 0 as positions that matched with CHAR which will set zero in > + the corresponding destination bytes in YMM2 / YMM4. */ > + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} > + VPMINU %YMM3, %YMM4, %YMM4 > + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > + > + VPCMP $0, %YMMZERO, %YMM4, %k1 > + kmovd %k1, %ecx > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > jz L(loop_4x_vec) > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM5, %k0 > + VPCMP $0, %YMMZERO, %YMM1, %k0 > kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x0) > + jnz L(last_vec_x1) > > - /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ > - VPCMP $0, %YMMZERO, %YMM6, %k1 > - kmovd %k1, %eax > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x1) > - > - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ > - VPCMP $0, %YMMZERO, %YMM7, %k2 > - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ > - VPCMP $0, %YMMZERO, %YMM8, %k3 > + jnz L(last_vec_x2) > > + VPCMP $0, %YMMZERO, %YMM3, %k0 > + kmovd %k0, %eax > + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > # ifdef USE_AS_WCSCHR > - /* NB: Each bit in K2/K3 represents 4-byte element. */ > - kshiftlw $8, %k3, %k1 > + sall $8, %ecx > + orl %ecx, %eax > + tzcntl %eax, %eax > # else > - kshiftlq $32, %k3, %k1 > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > # endif > +# ifndef USE_AS_STRCHRNUL > + /* Check if match was CHAR or null. */ > + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > +# endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > > - /* Each bit in K1 represents a NULL or a mismatch. */ > - korq %k1, %k2, %k1 > - kmovq %k1, %rax > +# ifndef USE_AS_STRCHRNUL > +L(zero_end): > + xorl %eax, %eax > + ret > +# endif > > - tzcntq %rax, %rax > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > -# else > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > + .p2align 4 > +L(last_vec_x1): > + tzcntl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > + /* Check if match was null. */ > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 4 > +L(last_vec_x2): > + tzcntl %eax, %eax > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Check if match was null. */ > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. 
*/ > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > ret > > /* Cold case for crossing page with first load. */ > .p2align 4 > L(cross_page_boundary): > + movq %rdi, %rdx > + /* Align rdi. */ > andq $-VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - > VMOVA (%rdi), %YMM1 > - > /* Leaves only CHARS matching esi as 0. */ > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > VPCMP $0, %YMMZERO, %YMM2, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - > + /* Remove the leading bits. */ > # ifdef USE_AS_WCSCHR > + movl %edx, %SHIFT_REG > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > bytes. */ > - movl %ecx, %SHIFT_REG > - sarl $2, %SHIFT_REG > + sarl $2, %SHIFT_REG > + andl $(CHAR_PER_VEC - 1), %SHIFT_REG > # endif > - > - /* Remove the leading bits. */ > sarxl %SHIFT_REG, %eax, %eax > + /* If eax is zero continue. */ > testl %eax, %eax > - > - jz L(aligned_more) > + jz L(cross_page_continue) > tzcntl %eax, %eax > - addq %rcx, %rdi > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if match was CHAR or null. */ > + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > +# endif > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + leaq (%rdx, %rax, CHAR_SIZE), %rax > # else > - addq %rdi, %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + addq %rdx, %rax > # endif > ret > > -- > 2.29.2 > Your strlen AVX2 and EVEX patches have been committed: commit aaa23c35071537e2dcf5807e956802ed215210aa Author: Noah Goldstein <goldstein.w.n@gmail.com> Date: Mon Apr 19 19:36:07 2021 -0400 x86: Optimize strlen-avx2.S No bug. This commit optimizes strlen-avx2.S. The optimizations are mostly small things but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> commit 4ba65586847751372520a36757c17f114588794e Author: Noah Goldstein <goldstein.w.n@gmail.com> Date: Mon Apr 19 19:36:06 2021 -0400 x86: Optimize strlen-evex.S No bug. This commit optimizes strlen-evex.S. The optimizations are mostly small things but they add up to roughly 10-30% performance improvement for strlen. The results for strnlen are bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen are all passing. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> Are the new patches incremental improvements? If yes, please rebase them. Thanks.
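For readers skimming the diff above, the entry-path change computes the page offset of the start pointer directly in %eax and branches to the cross-page path only when a full unaligned vector load could touch the next page. A minimal scalar sketch of that check, assuming the 4 KiB page size used in the file (the function name here is mine, not from the patch):

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* An unaligned VEC_SIZE load from s cannot fault beyond the string as
   long as it stays inside the page s starts in, i.e. the offset of s
   within its page leaves at least VEC_SIZE bytes before the page end.  */
static bool
can_load_one_vec_unaligned (const char *s)
{
  uintptr_t page_off = (uintptr_t) s & (PAGE_SIZE - 1); /* andl $(PAGE_SIZE - 1), %eax */
  return page_off <= PAGE_SIZE - VEC_SIZE;              /* cmpl; ja L(cross_page_boundary) */
}

When the test fails, the code branches to L(cross_page_boundary), does an aligned load instead, and shifts the resulting mask right by the misalignment.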
On Thu, Apr 22, 2021 at 1:08 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Apr 21, 2021 at 2:40 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > No bug. This commit optimizes strlen-evex.S. The optimizations are > > mostly small things such as save an ALU in the alignment process, > > saving a few instructions in the loop return. The one significant > > change is saving 2 instructions in the 4x loop. test-strchr, > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++----------- > > 1 file changed, 214 insertions(+), 174 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > > index ddc86a7058..7cd111e96c 100644 > > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > > @@ -24,23 +24,26 @@ > > # define STRCHR __strchr_evex > > # endif > > > > -# define VMOVU vmovdqu64 > > -# define VMOVA vmovdqa64 > > +# define VMOVU vmovdqu64 > > +# define VMOVA vmovdqa64 > > > > # ifdef USE_AS_WCSCHR > > # define VPBROADCAST vpbroadcastd > > # define VPCMP vpcmpd > > # define VPMINU vpminud > > # define CHAR_REG esi > > -# define SHIFT_REG r8d > > +# define SHIFT_REG ecx > > +# define CHAR_SIZE 4 > > # else > > # define VPBROADCAST vpbroadcastb > > # define VPCMP vpcmpb > > # define VPMINU vpminub > > # define CHAR_REG sil > > -# define SHIFT_REG ecx > > +# define SHIFT_REG edx > > +# define CHAR_SIZE 1 > > # endif > > > > + > > # define XMMZERO xmm16 > > > > # define YMMZERO ymm16 > > @@ -56,23 +59,20 @@ > > > > # define VEC_SIZE 32 > > # define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > .section .text.evex,"ax",@progbits > > ENTRY (STRCHR) > > - movl %edi, %ecx > > -# ifndef USE_AS_STRCHRNUL > > - xorl %edx, %edx > > -# endif > > - > > /* Broadcast CHAR to YMM0. */ > > - VPBROADCAST %esi, %YMM0 > > - > > + VPBROADCAST %esi, %YMM0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > > - /* Check if we cross page boundary with one vector load. */ > > - andl $(PAGE_SIZE - 1), %ecx > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > - ja L(cross_page_boundary) > > + /* Check if we cross page boundary with one vector load. Otherwise > > + it is safe to use an unaligned load. */ > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > > null bytes. */ > > @@ -83,251 +83,291 @@ ENTRY (STRCHR) > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > VPCMP $0, %YMMZERO, %YMM2, %k0 > > - ktestd %k0, %k0 > > - jz L(more_vecs) > > kmovd %k0, %eax > > + testl %eax, %eax > > + jz L(aligned_more) > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > # else > > addq %rdi, %rax > > # endif > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rax), %CHAR_REG > > + jne L(zero) > > # endif > > ret > > > > - .p2align 4 > > -L(more_vecs): > > - /* Align data for aligned loads in the loop. */ > > - andq $-VEC_SIZE, %rdi > > -L(aligned_more): > > - > > - /* Check the next 4 * VEC_SIZE. 
Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - VMOVA VEC_SIZE(%rdi), %YMM1 > > - addq $VEC_SIZE, %rdi > > - > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - > > - VMOVA VEC_SIZE(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > - > > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x2) > > - > > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - ktestd %k0, %k0 > > - jz L(prep_loop_4x) > > - > > - kmovd %k0, %eax > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > > + alignment % 32 was either 16 or 0. As well this makes the > > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > > + easier. */ > > + .p2align 5 > > +L(first_vec_x3): > > tzcntl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax > > -# else > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > -# endif > > +L(zero): > > + xorl %eax, %eax > > ret > > +# endif > > > > .p2align 4 > > -L(first_vec_x0): > > +L(first_vec_x4): > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if first match was CHAR (k0) or null (k1). */ > > + kmovd %k0, %eax > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + kmovd %k1, %ecx > > + /* bzhil will not be 0 if first match was null. */ > > + bzhil %eax, %ecx, %ecx > > + jne L(zero) > > # else > > - addq %rdi, %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Combine CHAR and null matches. */ > > + kord %k0, %k1, %k0 > > + kmovd %k0, %eax > > + tzcntl %eax, %eax > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > .p2align 4 > > L(first_vec_x1): > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ > > - leaq VEC_SIZE(%rdi, %rax, 4), %rax > > -# else > > - leaq VEC_SIZE(%rdi, %rax), %rax > > -# endif > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero) > > + > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > .p2align 4 > > L(first_vec_x2): > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if first match was CHAR (k0) or null (k1). */ > > + kmovd %k0, %eax > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > > + kmovd %k1, %ecx > > + /* bzhil will not be 0 if first match was null. */ > > + bzhil %eax, %ecx, %ecx > > + jne L(zero) > > # else > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Combine CHAR and null matches. */ > > + kord %k0, %k1, %k0 > > + kmovd %k0, %eax > > + tzcntl %eax, %eax > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > -L(prep_loop_4x): > > - /* Align data to 4 * VEC_SIZE. */ > > + .p2align 4 > > +L(aligned_more): > > + /* Align data to VEC_SIZE. */ > > + andq $-VEC_SIZE, %rdi > > +L(cross_page_continue): > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > > + data is only aligned to VEC_SIZE. Use two alternating methods for > > + checking VEC to balance latency and port contention. */ > > + > > + /* This method has higher latency but has better port > > + distribution. */ > > + VMOVA (VEC_SIZE)(%rdi), %YMM1 > > + /* Leaves only CHARS matching esi as 0. */ > > + vpxorq %YMM1, %YMM0, %YMM2 > > + VPMINU %YMM2, %YMM1, %YMM2 > > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x1) > > + > > + /* This method has higher latency but has better port > > + distribution. */ > > + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > > + /* Each bit in K0 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMM0, %k0 > > + /* Each bit in K1 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMMZERO, %k1 > > + kortestd %k0, %k1 > > + jnz L(first_vec_x2) > > + > > + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > > + /* Leaves only CHARS matching esi as 0. */ > > + vpxorq %YMM1, %YMM0, %YMM2 > > + VPMINU %YMM2, %YMM1, %YMM2 > > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x3) > > + > > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > > + /* Each bit in K0 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMM0, %k0 > > + /* Each bit in K1 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMMZERO, %k1 > > + kortestd %k0, %k1 > > + jnz L(first_vec_x4) > > + > > + /* Align data to VEC_SIZE * 4 for the loop. */ > > + addq $VEC_SIZE, %rdi > > andq $-(VEC_SIZE * 4), %rdi > > > > .p2align 4 > > L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > + /* Check 4x VEC at a time. No penalty to imm32 offset with evex > > + encoding. 
*/ > > VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > > VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 > > VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > > VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 > > > > - /* Leaves only CHARS matching esi as 0. */ > > + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ > > vpxorq %YMM1, %YMM0, %YMM5 > > - vpxorq %YMM2, %YMM0, %YMM6 > > + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k > > + register. Its possible to save either 1 or 2 instructions using cmp no > > + equals method for either YMM1 or YMM1 and YMM3 respectively but > > + bottleneck on p5 makes it no worth it. */ > > + VPCMP $4, %YMM0, %YMM2, %k2 > > vpxorq %YMM3, %YMM0, %YMM7 > > - vpxorq %YMM4, %YMM0, %YMM8 > > - > > - VPMINU %YMM5, %YMM1, %YMM5 > > - VPMINU %YMM6, %YMM2, %YMM6 > > - VPMINU %YMM7, %YMM3, %YMM7 > > - VPMINU %YMM8, %YMM4, %YMM8 > > - > > - VPMINU %YMM5, %YMM6, %YMM1 > > - VPMINU %YMM7, %YMM8, %YMM2 > > - > > - VPMINU %YMM1, %YMM2, %YMM1 > > - > > - /* Each bit in K0 represents a CHAR or a null byte. */ > > - VPCMP $0, %YMMZERO, %YMM1, %k0 > > - > > - addq $(VEC_SIZE * 4), %rdi > > - > > - ktestd %k0, %k0 > > + VPCMP $4, %YMM0, %YMM4, %k4 > > + > > + /* Use min to select all zeros (either from xor or end of string). */ > > + VPMINU %YMM1, %YMM5, %YMM1 > > + VPMINU %YMM3, %YMM7, %YMM3 > > + > > + /* Use min + zeromask to select for zeros. Since k2 and k4 will be > > + have 0 as positions that matched with CHAR which will set zero in > > + the corresponding destination bytes in YMM2 / YMM4. */ > > + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} > > + VPMINU %YMM3, %YMM4, %YMM4 > > + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > > + > > + VPCMP $0, %YMMZERO, %YMM4, %k1 > > + kmovd %k1, %ecx > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > jz L(loop_4x_vec) > > > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM5, %k0 > > + VPCMP $0, %YMMZERO, %YMM1, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > - jnz L(first_vec_x0) > > + jnz L(last_vec_x1) > > > > - /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ > > - VPCMP $0, %YMMZERO, %YMM6, %k1 > > - kmovd %k1, %eax > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > testl %eax, %eax > > - jnz L(first_vec_x1) > > - > > - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ > > - VPCMP $0, %YMMZERO, %YMM7, %k2 > > - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ > > - VPCMP $0, %YMMZERO, %YMM8, %k3 > > + jnz L(last_vec_x2) > > > > + VPCMP $0, %YMMZERO, %YMM3, %k0 > > + kmovd %k0, %eax > > + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > > # ifdef USE_AS_WCSCHR > > - /* NB: Each bit in K2/K3 represents 4-byte element. */ > > - kshiftlw $8, %k3, %k1 > > + sall $8, %ecx > > + orl %ecx, %eax > > + tzcntl %eax, %eax > > # else > > - kshiftlq $32, %k3, %k1 > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > # endif > > +# ifndef USE_AS_STRCHRNUL > > + /* Check if match was CHAR or null. */ > > + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > +# endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > > > - /* Each bit in K1 represents a NULL or a mismatch. 
*/ > > - korq %k1, %k2, %k1 > > - kmovq %k1, %rax > > +# ifndef USE_AS_STRCHRNUL > > +L(zero_end): > > + xorl %eax, %eax > > + ret > > +# endif > > > > - tzcntq %rax, %rax > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > > -# else > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > + .p2align 4 > > +L(last_vec_x1): > > + tzcntl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > + /* Check if match was null. */ > > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 4 > > +L(last_vec_x2): > > + tzcntl %eax, %eax > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Check if match was null. */ > > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > /* Cold case for crossing page with first load. */ > > .p2align 4 > > L(cross_page_boundary): > > + movq %rdi, %rdx > > + /* Align rdi. */ > > andq $-VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - > > VMOVA (%rdi), %YMM1 > > - > > /* Leaves only CHARS matching esi as 0. */ > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > VPCMP $0, %YMMZERO, %YMM2, %k0 > > kmovd %k0, %eax > > - testl %eax, %eax > > - > > + /* Remove the leading bits. */ > > # ifdef USE_AS_WCSCHR > > + movl %edx, %SHIFT_REG > > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > > bytes. */ > > - movl %ecx, %SHIFT_REG > > - sarl $2, %SHIFT_REG > > + sarl $2, %SHIFT_REG > > + andl $(CHAR_PER_VEC - 1), %SHIFT_REG > > # endif > > - > > - /* Remove the leading bits. */ > > sarxl %SHIFT_REG, %eax, %eax > > + /* If eax is zero continue. */ > > testl %eax, %eax > > - > > - jz L(aligned_more) > > + jz L(cross_page_continue) > > tzcntl %eax, %eax > > - addq %rcx, %rdi > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if match was CHAR or null. */ > > + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > +# endif > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + leaq (%rdx, %rax, CHAR_SIZE), %rax > > # else > > - addq %rdi, %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + addq %rdx, %rax > > # endif > > ret > > > > -- > > 2.29.2 > > > > Your strlen AVX2 and EVEX patches have been committed: > > commit aaa23c35071537e2dcf5807e956802ed215210aa > Author: Noah Goldstein <goldstein.w.n@gmail.com> > Date: Mon Apr 19 19:36:07 2021 -0400 > > x86: Optimize strlen-avx2.S > > No bug. This commit optimizes strlen-avx2.S. The optimizations are > mostly small things but they add up to roughly 10-30% performance > improvement for strlen. The results for strnlen are bit more > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen > are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > commit 4ba65586847751372520a36757c17f114588794e > Author: Noah Goldstein <goldstein.w.n@gmail.com> > Date: Mon Apr 19 19:36:06 2021 -0400 > > x86: Optimize strlen-evex.S > > No bug. This commit optimizes strlen-evex.S. 
The > optimizations are mostly small things but they add up to roughly > 10-30% performance improvement for strlen. The results for strnlen are > bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and > test-wcsnlen are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > Are the new patches incremental improvements? If yes, please rebase them. Faulty commit message. These are for strchr. Sorry! Submitted patch with fixed commit messages. > > Thanks. > > -- > H.J.
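For reference, here is a scalar C model of the xor plus unsigned-min trick that the quoted comments describe ("Leaves only CHARS matching esi as 0", then VPMINU and a compare against the zero vector); the helper name is invented for illustration:

#include <stdbool.h>
#include <stdint.h>

/* min(x, x ^ c) is zero exactly when x is the null byte or x equals
   CHAR, so a single compare against zero finds both terminators of the
   search at once.  This mirrors vpxorq + VPMINU + VPCMP $0 per byte
   (per dword in the wcschr variant).  */
static bool
byte_is_null_or_char (uint8_t x, uint8_t c)
{
  uint8_t t = x ^ c;          /* vpxorq: lanes equal to CHAR become 0 */
  uint8_t m = x < t ? x : t;  /* VPMINU: keeps a 0 from either side   */
  return m == 0;              /* VPCMP $0 against YMMZERO             */
}

The set bits of the resulting k0 mask are then consumed with kmovd/tzcnt to get the index of the first hit.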
On Wed, Apr 21, 2021 at 05:39:53PM -0400, Noah Goldstein wrote: > No bug. This commit optimizes strlen-evex.S. The optimizations are > mostly small things such as save an ALU in the alignment process, > saving a few instructions in the loop return. The one significant > change is saving 2 instructions in the 4x loop. test-strchr, > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > --- > sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++----------- > 1 file changed, 214 insertions(+), 174 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > index ddc86a7058..7cd111e96c 100644 > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > @@ -24,23 +24,26 @@ > # define STRCHR __strchr_evex > # endif > > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > +# define VMOVU vmovdqu64 > +# define VMOVA vmovdqa64 These changes aren't needed. > > # ifdef USE_AS_WCSCHR > # define VPBROADCAST vpbroadcastd > # define VPCMP vpcmpd > # define VPMINU vpminud > # define CHAR_REG esi > -# define SHIFT_REG r8d > +# define SHIFT_REG ecx > +# define CHAR_SIZE 4 > # else > # define VPBROADCAST vpbroadcastb > # define VPCMP vpcmpb > # define VPMINU vpminub > # define CHAR_REG sil > -# define SHIFT_REG ecx > +# define SHIFT_REG edx > +# define CHAR_SIZE 1 > # endif > > + No need to add a blank line here. > # define XMMZERO xmm16 > > # define YMMZERO ymm16 > @@ -56,23 +59,20 @@ > > # define VEC_SIZE 32 > # define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > .section .text.evex,"ax",@progbits > ENTRY (STRCHR) > - movl %edi, %ecx > -# ifndef USE_AS_STRCHRNUL > - xorl %edx, %edx > -# endif > - > /* Broadcast CHAR to YMM0. */ > - VPBROADCAST %esi, %YMM0 > - > + VPBROADCAST %esi, %YMM0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > - /* Check if we cross page boundary with one vector load. */ > - andl $(PAGE_SIZE - 1), %ecx > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > - ja L(cross_page_boundary) > + /* Check if we cross page boundary with one vector load. Otherwise > + it is safe to use an unaligned load. */ > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > null bytes. */ > @@ -83,251 +83,291 @@ ENTRY (STRCHR) > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > VPCMP $0, %YMMZERO, %YMM2, %k0 > - ktestd %k0, %k0 > - jz L(more_vecs) > kmovd %k0, %eax > + testl %eax, %eax > + jz L(aligned_more) > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + leaq (%rdi, %rax, CHAR_SIZE), %rax > # else > addq %rdi, %rax > # endif > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (%rax), %CHAR_REG > + jne L(zero) > # endif > ret > > - .p2align 4 > -L(more_vecs): > - /* Align data for aligned loads in the loop. */ > - andq $-VEC_SIZE, %rdi > -L(aligned_more): > - > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > - VMOVA VEC_SIZE(%rdi), %YMM1 > - addq $VEC_SIZE, %rdi > - > - /* Leaves only CHARS matching esi as 0. 
*/ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x0) > - > - VMOVA VEC_SIZE(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x1) > - > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(first_vec_x2) > - > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > - /* Leaves only CHARS matching esi as 0. */ > - vpxorq %YMM1, %YMM0, %YMM2 > - VPMINU %YMM2, %YMM1, %YMM2 > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM2, %k0 > - ktestd %k0, %k0 > - jz L(prep_loop_4x) > - > - kmovd %k0, %eax > + /* .p2align 5 helps keep performance more consistent if ENTRY() > + alignment % 32 was either 16 or 0. As well this makes the > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > + easier. */ > + .p2align 5 > +L(first_vec_x3): > tzcntl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax > -# else > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > -# endif > +L(zero): > + xorl %eax, %eax > ret > +# endif > > .p2align 4 > -L(first_vec_x0): > +L(first_vec_x4): > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if first match was CHAR (k0) or null (k1). */ > + kmovd %k0, %eax > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + kmovd %k1, %ecx > + /* bzhil will not be 0 if first match was null. */ > + bzhil %eax, %ecx, %ecx > + jne L(zero) > # else > - addq %rdi, %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Combine CHAR and null matches. */ > + kord %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > ret > > .p2align 4 > L(first_vec_x1): > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq VEC_SIZE(%rdi, %rax, 4), %rax > -# else > - leaq VEC_SIZE(%rdi, %rax), %rax > -# endif > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Found CHAR or the null byte. */ > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero) > + > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. 
*/ > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > ret > > .p2align 4 > L(first_vec_x2): > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if first match was CHAR (k0) or null (k1). */ > + kmovd %k0, %eax > tzcntl %eax, %eax > - /* Found CHAR or the null byte. */ > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > + kmovd %k1, %ecx > + /* bzhil will not be 0 if first match was null. */ > + bzhil %eax, %ecx, %ecx > + jne L(zero) > # else > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Combine CHAR and null matches. */ > + kord %k0, %k1, %k0 > + kmovd %k0, %eax > + tzcntl %eax, %eax > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > ret > > -L(prep_loop_4x): > - /* Align data to 4 * VEC_SIZE. */ > + .p2align 4 > +L(aligned_more): > + /* Align data to VEC_SIZE. */ > + andq $-VEC_SIZE, %rdi > +L(cross_page_continue): > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > + data is only aligned to VEC_SIZE. Use two alternating methods for > + checking VEC to balance latency and port contention. */ Please limit lines to 72 columns. > + > + /* This method has higher latency but has better port > + distribution. */ > + VMOVA (VEC_SIZE)(%rdi), %YMM1 > + /* Leaves only CHARS matching esi as 0. */ > + vpxorq %YMM1, %YMM0, %YMM2 > + VPMINU %YMM2, %YMM1, %YMM2 > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x1) > + > + /* This method has higher latency but has better port > + distribution. */ > + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > + /* Each bit in K0 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMM0, %k0 > + /* Each bit in K1 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMMZERO, %k1 > + kortestd %k0, %k1 > + jnz L(first_vec_x2) > + > + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > + /* Leaves only CHARS matching esi as 0. */ > + vpxorq %YMM1, %YMM0, %YMM2 > + VPMINU %YMM2, %YMM1, %YMM2 > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > + testl %eax, %eax > + jnz L(first_vec_x3) > + > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > + /* Each bit in K0 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMM0, %k0 > + /* Each bit in K1 represents a CHAR in YMM1. */ > + VPCMP $0, %YMM1, %YMMZERO, %k1 > + kortestd %k0, %k1 > + jnz L(first_vec_x4) > + > + /* Align data to VEC_SIZE * 4 for the loop. */ > + addq $VEC_SIZE, %rdi > andq $-(VEC_SIZE * 4), %rdi > > .p2align 4 > L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > + /* Check 4x VEC at a time. No penalty to imm32 offset with evex > + encoding. */ > VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 > VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 > > - /* Leaves only CHARS matching esi as 0. */ > + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ Please limit lines to 72 columns. > vpxorq %YMM1, %YMM0, %YMM5 > - vpxorq %YMM2, %YMM0, %YMM6 > + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k > + register. Its possible to save either 1 or 2 instructions using cmp no > + equals method for either YMM1 or YMM1 and YMM3 respectively but > + bottleneck on p5 makes it no worth it. 
*/ Please limit lines to 72 columns. > + VPCMP $4, %YMM0, %YMM2, %k2 > vpxorq %YMM3, %YMM0, %YMM7 > - vpxorq %YMM4, %YMM0, %YMM8 > - > - VPMINU %YMM5, %YMM1, %YMM5 > - VPMINU %YMM6, %YMM2, %YMM6 > - VPMINU %YMM7, %YMM3, %YMM7 > - VPMINU %YMM8, %YMM4, %YMM8 > - > - VPMINU %YMM5, %YMM6, %YMM1 > - VPMINU %YMM7, %YMM8, %YMM2 > - > - VPMINU %YMM1, %YMM2, %YMM1 > - > - /* Each bit in K0 represents a CHAR or a null byte. */ > - VPCMP $0, %YMMZERO, %YMM1, %k0 > - > - addq $(VEC_SIZE * 4), %rdi > - > - ktestd %k0, %k0 > + VPCMP $4, %YMM0, %YMM4, %k4 > + > + /* Use min to select all zeros (either from xor or end of string). */ Please limit lines to 72 columns. > + VPMINU %YMM1, %YMM5, %YMM1 > + VPMINU %YMM3, %YMM7, %YMM3 > + > + /* Use min + zeromask to select for zeros. Since k2 and k4 will be > + have 0 as positions that matched with CHAR which will set zero in > + the corresponding destination bytes in YMM2 / YMM4. */ Please limit lines to 72 columns. > + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} > + VPMINU %YMM3, %YMM4, %YMM4 > + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > + > + VPCMP $0, %YMMZERO, %YMM4, %k1 > + kmovd %k1, %ecx > + subq $-(VEC_SIZE * 4), %rdi > + testl %ecx, %ecx > jz L(loop_4x_vec) > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > - VPCMP $0, %YMMZERO, %YMM5, %k0 > + VPCMP $0, %YMMZERO, %YMM1, %k0 > kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x0) > + jnz L(last_vec_x1) > > - /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ > - VPCMP $0, %YMMZERO, %YMM6, %k1 > - kmovd %k1, %eax > + VPCMP $0, %YMMZERO, %YMM2, %k0 > + kmovd %k0, %eax > testl %eax, %eax > - jnz L(first_vec_x1) > - > - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ > - VPCMP $0, %YMMZERO, %YMM7, %k2 > - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ > - VPCMP $0, %YMMZERO, %YMM8, %k3 > + jnz L(last_vec_x2) > > + VPCMP $0, %YMMZERO, %YMM3, %k0 > + kmovd %k0, %eax > + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > # ifdef USE_AS_WCSCHR > - /* NB: Each bit in K2/K3 represents 4-byte element. */ > - kshiftlw $8, %k3, %k1 > + sall $8, %ecx > + orl %ecx, %eax > + tzcntl %eax, %eax > # else > - kshiftlq $32, %k3, %k1 > + salq $32, %rcx > + orq %rcx, %rax > + tzcntq %rax, %rax > # endif > +# ifndef USE_AS_STRCHRNUL > + /* Check if match was CHAR or null. */ > + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > +# endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + ret > > - /* Each bit in K1 represents a NULL or a mismatch. */ > - korq %k1, %k2, %k1 > - kmovq %k1, %rax > +# ifndef USE_AS_STRCHRNUL > +L(zero_end): > + xorl %eax, %eax > + ret > +# endif > > - tzcntq %rax, %rax > -# ifdef USE_AS_WCSCHR > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > -# else > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > + .p2align 4 > +L(last_vec_x1): > + tzcntl %eax, %eax > +# ifndef USE_AS_STRCHRNUL > + /* Check if match was null. */ > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + .p2align 4 > +L(last_vec_x2): > + tzcntl %eax, %eax > # ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + /* Check if match was null. 
*/ > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > # endif > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > + bytes. */ > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > ret > > /* Cold case for crossing page with first load. */ > .p2align 4 > L(cross_page_boundary): > + movq %rdi, %rdx > + /* Align rdi. */ > andq $-VEC_SIZE, %rdi > - andl $(VEC_SIZE - 1), %ecx > - > VMOVA (%rdi), %YMM1 > - > /* Leaves only CHARS matching esi as 0. */ > vpxorq %YMM1, %YMM0, %YMM2 > VPMINU %YMM2, %YMM1, %YMM2 > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > VPCMP $0, %YMMZERO, %YMM2, %k0 > kmovd %k0, %eax > - testl %eax, %eax > - > + /* Remove the leading bits. */ > # ifdef USE_AS_WCSCHR > + movl %edx, %SHIFT_REG > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > bytes. */ > - movl %ecx, %SHIFT_REG > - sarl $2, %SHIFT_REG > + sarl $2, %SHIFT_REG > + andl $(CHAR_PER_VEC - 1), %SHIFT_REG > # endif > - > - /* Remove the leading bits. */ > sarxl %SHIFT_REG, %eax, %eax > + /* If eax is zero continue. */ > testl %eax, %eax > - > - jz L(aligned_more) > + jz L(cross_page_continue) > tzcntl %eax, %eax > - addq %rcx, %rdi > +# ifndef USE_AS_STRCHRNUL > + /* Check to see if match was CHAR or null. */ > + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > + jne L(zero_end) > +# endif > # ifdef USE_AS_WCSCHR > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > - leaq (%rdi, %rax, 4), %rax > + leaq (%rdx, %rax, CHAR_SIZE), %rax > # else > - addq %rdi, %rax > -# endif > -# ifndef USE_AS_STRCHRNUL > - cmp (%rax), %CHAR_REG > - cmovne %rdx, %rax > + addq %rdx, %rax > # endif > ret > > -- > 2.29.2 > Thanks. H.J.
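The two instructions saved in the 4x loop come from handling YMM2 and YMM4 with a single not-equal VPCMP each, whose mask then zeroes the matching lanes in a later masked VPMINU, instead of a separate vpxorq + VPMINU pair per vector. A per-lane scalar sketch of how I read that combination (helper names are mine, not from the patch):

#include <stdint.h>

static uint8_t
minu8 (uint8_t a, uint8_t b)
{
  return a < b ? a : b;
}

/* v1..v4 are the bytes at one lane of the four loaded vectors, c is
   the broadcast CHAR.  The result is 0 exactly when some vi is 0 or
   equal to c, which is what the final VPCMP of YMM4 against YMMZERO
   detects for the whole 4x block.  */
static uint8_t
lane_combine (uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t c)
{
  uint8_t m1 = minu8 (v1, v1 ^ c);              /* vpxorq + VPMINU     */
  uint8_t m3 = minu8 (v3, v3 ^ c);
  /* k2/k4 hold "lane != CHAR"; a cleared mask bit zeroes the lane.  */
  uint8_t m2 = (v2 != c) ? minu8 (m1, v2) : 0;  /* VPMINU ... {%k2}{z} */
  uint8_t m4 = minu8 (m3, v4);                  /* VPMINU              */
  return (v4 != c) ? minu8 (m2, m4) : 0;        /* VPMINU ... {%k4}{z} */
}

Because a zero in the final vector can originate from any of the four inputs at that lane, the code after the loop still re-tests YMM1, YMM2 and YMM3 individually (and reuses the k1 mask kept in %ecx for YMM4) to locate which vector actually hit.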
On Fri, Apr 23, 2021 at 1:03 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Wed, Apr 21, 2021 at 05:39:53PM -0400, Noah Goldstein wrote: > > No bug. This commit optimizes strlen-evex.S. The optimizations are > > mostly small things such as save an ALU in the alignment process, > > saving a few instructions in the loop return. The one significant > > change is saving 2 instructions in the 4x loop. test-strchr, > > test-strchrnul, test-wcschr, and test-wcschrnul are all passing. > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com> > > --- > > sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++----------- > > 1 file changed, 214 insertions(+), 174 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S > > index ddc86a7058..7cd111e96c 100644 > > --- a/sysdeps/x86_64/multiarch/strchr-evex.S > > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S > > @@ -24,23 +24,26 @@ > > # define STRCHR __strchr_evex > > # endif > > > > -# define VMOVU vmovdqu64 > > -# define VMOVA vmovdqa64 > > +# define VMOVU vmovdqu64 > > +# define VMOVA vmovdqa64 > > These changes aren't needed. Fixed. > > > > > # ifdef USE_AS_WCSCHR > > # define VPBROADCAST vpbroadcastd > > # define VPCMP vpcmpd > > # define VPMINU vpminud > > # define CHAR_REG esi > > -# define SHIFT_REG r8d > > +# define SHIFT_REG ecx > > +# define CHAR_SIZE 4 > > # else > > # define VPBROADCAST vpbroadcastb > > # define VPCMP vpcmpb > > # define VPMINU vpminub > > # define CHAR_REG sil > > -# define SHIFT_REG ecx > > +# define SHIFT_REG edx > > +# define CHAR_SIZE 1 > > # endif > > > > + > > No need to add a blank line here. Fixed. > > > # define XMMZERO xmm16 > > > > # define YMMZERO ymm16 > > @@ -56,23 +59,20 @@ > > > > # define VEC_SIZE 32 > > # define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > .section .text.evex,"ax",@progbits > > ENTRY (STRCHR) > > - movl %edi, %ecx > > -# ifndef USE_AS_STRCHRNUL > > - xorl %edx, %edx > > -# endif > > - > > /* Broadcast CHAR to YMM0. */ > > - VPBROADCAST %esi, %YMM0 > > - > > + VPBROADCAST %esi, %YMM0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > > - /* Check if we cross page boundary with one vector load. */ > > - andl $(PAGE_SIZE - 1), %ecx > > - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > - ja L(cross_page_boundary) > > + /* Check if we cross page boundary with one vector load. Otherwise > > + it is safe to use an unaligned load. */ > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. Search for both CHAR and the > > null bytes. */ > > @@ -83,251 +83,291 @@ ENTRY (STRCHR) > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > VPCMP $0, %YMMZERO, %YMM2, %k0 > > - ktestd %k0, %k0 > > - jz L(more_vecs) > > kmovd %k0, %eax > > + testl %eax, %eax > > + jz L(aligned_more) > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > # else > > addq %rdi, %rax > > # endif > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (%rax), %CHAR_REG > > + jne L(zero) > > # endif > > ret > > > > - .p2align 4 > > -L(more_vecs): > > - /* Align data for aligned loads in the loop. 
*/ > > - andq $-VEC_SIZE, %rdi > > -L(aligned_more): > > - > > - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time > > - since data is only aligned to VEC_SIZE. */ > > - VMOVA VEC_SIZE(%rdi), %YMM1 > > - addq $VEC_SIZE, %rdi > > - > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - > > - VMOVA VEC_SIZE(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > - > > - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - kmovd %k0, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x2) > > - > > - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > > - /* Leaves only CHARS matching esi as 0. */ > > - vpxorq %YMM1, %YMM0, %YMM2 > > - VPMINU %YMM2, %YMM1, %YMM2 > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM2, %k0 > > - ktestd %k0, %k0 > > - jz L(prep_loop_4x) > > - > > - kmovd %k0, %eax > > + /* .p2align 5 helps keep performance more consistent if ENTRY() > > + alignment % 32 was either 16 or 0. As well this makes the > > + alignment % 32 of the loop_4x_vec fixed which makes tuning it > > + easier. */ > > + .p2align 5 > > +L(first_vec_x3): > > tzcntl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax > > -# else > > - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax > > + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > -# endif > > +L(zero): > > + xorl %eax, %eax > > ret > > +# endif > > > > .p2align 4 > > -L(first_vec_x0): > > +L(first_vec_x4): > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if first match was CHAR (k0) or null (k1). */ > > + kmovd %k0, %eax > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + kmovd %k1, %ecx > > + /* bzhil will not be 0 if first match was null. */ > > + bzhil %eax, %ecx, %ecx > > + jne L(zero) > > # else > > - addq %rdi, %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Combine CHAR and null matches. */ > > + kord %k0, %k1, %k0 > > + kmovd %k0, %eax > > + tzcntl %eax, %eax > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > .p2align 4 > > L(first_vec_x1): > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. 
*/ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq VEC_SIZE(%rdi, %rax, 4), %rax > > -# else > > - leaq VEC_SIZE(%rdi, %rax), %rax > > -# endif > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Found CHAR or the null byte. */ > > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero) > > + > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > .p2align 4 > > L(first_vec_x2): > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if first match was CHAR (k0) or null (k1). */ > > + kmovd %k0, %eax > > tzcntl %eax, %eax > > - /* Found CHAR or the null byte. */ > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > > + kmovd %k1, %ecx > > + /* bzhil will not be 0 if first match was null. */ > > + bzhil %eax, %ecx, %ecx > > + jne L(zero) > > # else > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Combine CHAR and null matches. */ > > + kord %k0, %k1, %k0 > > + kmovd %k0, %eax > > + tzcntl %eax, %eax > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > -L(prep_loop_4x): > > - /* Align data to 4 * VEC_SIZE. */ > > + .p2align 4 > > +L(aligned_more): > > + /* Align data to VEC_SIZE. */ > > + andq $-VEC_SIZE, %rdi > > +L(cross_page_continue): > > + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since > > + data is only aligned to VEC_SIZE. Use two alternating methods for > > + checking VEC to balance latency and port contention. */ > > Please limit lines to 72 columns. > Fixed. > > + > > + /* This method has higher latency but has better port > > + distribution. */ > > + VMOVA (VEC_SIZE)(%rdi), %YMM1 > > + /* Leaves only CHARS matching esi as 0. */ > > + vpxorq %YMM1, %YMM0, %YMM2 > > + VPMINU %YMM2, %YMM1, %YMM2 > > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x1) > > + > > + /* This method has higher latency but has better port > > + distribution. */ > > + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 > > + /* Each bit in K0 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMM0, %k0 > > + /* Each bit in K1 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMMZERO, %k1 > > + kortestd %k0, %k1 > > + jnz L(first_vec_x2) > > + > > + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 > > + /* Leaves only CHARS matching esi as 0. */ > > + vpxorq %YMM1, %YMM0, %YMM2 > > + VPMINU %YMM2, %YMM1, %YMM2 > > + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x3) > > + > > + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > > + /* Each bit in K0 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMM0, %k0 > > + /* Each bit in K1 represents a CHAR in YMM1. */ > > + VPCMP $0, %YMM1, %YMMZERO, %k1 > > + kortestd %k0, %k1 > > + jnz L(first_vec_x4) > > + > > + /* Align data to VEC_SIZE * 4 for the loop. */ > > + addq $VEC_SIZE, %rdi > > andq $-(VEC_SIZE * 4), %rdi > > > > .p2align 4 > > L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. 
*/ > > + /* Check 4x VEC at a time. No penalty to imm32 offset with evex > > + encoding. */ > > VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > > VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 > > VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > > VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 > > > > - /* Leaves only CHARS matching esi as 0. */ > > + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ > > Please limit lines to 72 columns. > Fixed. > > vpxorq %YMM1, %YMM0, %YMM5 > > - vpxorq %YMM2, %YMM0, %YMM6 > > + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k > > + register. Its possible to save either 1 or 2 instructions using cmp no > > + equals method for either YMM1 or YMM1 and YMM3 respectively but > > + bottleneck on p5 makes it no worth it. */ > > Please limit lines to 72 columns. > Fixed. > > + VPCMP $4, %YMM0, %YMM2, %k2 > > vpxorq %YMM3, %YMM0, %YMM7 > > - vpxorq %YMM4, %YMM0, %YMM8 > > - > > - VPMINU %YMM5, %YMM1, %YMM5 > > - VPMINU %YMM6, %YMM2, %YMM6 > > - VPMINU %YMM7, %YMM3, %YMM7 > > - VPMINU %YMM8, %YMM4, %YMM8 > > - > > - VPMINU %YMM5, %YMM6, %YMM1 > > - VPMINU %YMM7, %YMM8, %YMM2 > > - > > - VPMINU %YMM1, %YMM2, %YMM1 > > - > > - /* Each bit in K0 represents a CHAR or a null byte. */ > > - VPCMP $0, %YMMZERO, %YMM1, %k0 > > - > > - addq $(VEC_SIZE * 4), %rdi > > - > > - ktestd %k0, %k0 > > + VPCMP $4, %YMM0, %YMM4, %k4 > > + > > + /* Use min to select all zeros (either from xor or end of string). */ > > Please limit lines to 72 columns. Fixed. > > > + VPMINU %YMM1, %YMM5, %YMM1 > > + VPMINU %YMM3, %YMM7, %YMM3 > > + > > + /* Use min + zeromask to select for zeros. Since k2 and k4 will be > > + have 0 as positions that matched with CHAR which will set zero in > > + the corresponding destination bytes in YMM2 / YMM4. */ > > Please limit lines to 72 columns. Fixed. > > > + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} > > + VPMINU %YMM3, %YMM4, %YMM4 > > + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} > > + > > + VPCMP $0, %YMMZERO, %YMM4, %k1 > > + kmovd %k1, %ecx > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > jz L(loop_4x_vec) > > > > - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > - VPCMP $0, %YMMZERO, %YMM5, %k0 > > + VPCMP $0, %YMMZERO, %YMM1, %k0 > > kmovd %k0, %eax > > testl %eax, %eax > > - jnz L(first_vec_x0) > > + jnz L(last_vec_x1) > > > > - /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ > > - VPCMP $0, %YMMZERO, %YMM6, %k1 > > - kmovd %k1, %eax > > + VPCMP $0, %YMMZERO, %YMM2, %k0 > > + kmovd %k0, %eax > > testl %eax, %eax > > - jnz L(first_vec_x1) > > - > > - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ > > - VPCMP $0, %YMMZERO, %YMM7, %k2 > > - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ > > - VPCMP $0, %YMMZERO, %YMM8, %k3 > > + jnz L(last_vec_x2) > > > > + VPCMP $0, %YMMZERO, %YMM3, %k0 > > + kmovd %k0, %eax > > + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ > > # ifdef USE_AS_WCSCHR > > - /* NB: Each bit in K2/K3 represents 4-byte element. */ > > - kshiftlw $8, %k3, %k1 > > + sall $8, %ecx > > + orl %ecx, %eax > > + tzcntl %eax, %eax > > # else > > - kshiftlq $32, %k3, %k1 > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > # endif > > +# ifndef USE_AS_STRCHRNUL > > + /* Check if match was CHAR or null. */ > > + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > +# endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. 
*/ > > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > > > - /* Each bit in K1 represents a NULL or a mismatch. */ > > - korq %k1, %k2, %k1 > > - kmovq %k1, %rax > > +# ifndef USE_AS_STRCHRNUL > > +L(zero_end): > > + xorl %eax, %eax > > + ret > > +# endif > > > > - tzcntq %rax, %rax > > -# ifdef USE_AS_WCSCHR > > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax > > -# else > > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > > + .p2align 4 > > +L(last_vec_x1): > > + tzcntl %eax, %eax > > +# ifndef USE_AS_STRCHRNUL > > + /* Check if match was null. */ > > + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + .p2align 4 > > +L(last_vec_x2): > > + tzcntl %eax, %eax > > # ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + /* Check if match was null. */ > > + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > # endif > > + /* NB: Multiply sizeof char type (1 or 4) to get the number of > > + bytes. */ > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > ret > > > > /* Cold case for crossing page with first load. */ > > .p2align 4 > > L(cross_page_boundary): > > + movq %rdi, %rdx > > + /* Align rdi. */ > > andq $-VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - > > VMOVA (%rdi), %YMM1 > > - > > /* Leaves only CHARS matching esi as 0. */ > > vpxorq %YMM1, %YMM0, %YMM2 > > VPMINU %YMM2, %YMM1, %YMM2 > > /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ > > VPCMP $0, %YMMZERO, %YMM2, %k0 > > kmovd %k0, %eax > > - testl %eax, %eax > > - > > + /* Remove the leading bits. */ > > # ifdef USE_AS_WCSCHR > > + movl %edx, %SHIFT_REG > > /* NB: Divide shift count by 4 since each bit in K1 represent 4 > > bytes. */ > > - movl %ecx, %SHIFT_REG > > - sarl $2, %SHIFT_REG > > + sarl $2, %SHIFT_REG > > + andl $(CHAR_PER_VEC - 1), %SHIFT_REG > > # endif > > - > > - /* Remove the leading bits. */ > > sarxl %SHIFT_REG, %eax, %eax > > + /* If eax is zero continue. */ > > testl %eax, %eax > > - > > - jz L(aligned_more) > > + jz L(cross_page_continue) > > tzcntl %eax, %eax > > - addq %rcx, %rdi > > +# ifndef USE_AS_STRCHRNUL > > + /* Check to see if match was CHAR or null. */ > > + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG > > + jne L(zero_end) > > +# endif > > # ifdef USE_AS_WCSCHR > > /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ > > - leaq (%rdi, %rax, 4), %rax > > + leaq (%rdx, %rax, CHAR_SIZE), %rax > > # else > > - addq %rdi, %rax > > -# endif > > -# ifndef USE_AS_STRCHRNUL > > - cmp (%rax), %CHAR_REG > > - cmovne %rdx, %rax > > + addq %rdx, %rax > > # endif > > ret > > > > -- > > 2.29.2 > > > > Thanks. > > H.J.
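The review exchange above centers on the new two-kmask pattern: one compare against CHAR into k0, one against zero into k1, kortest to detect any hit, and bzhi to decide whether the terminator comes before the first CHAR match. The following is a minimal C sketch of that idea, not the glibc code: the function and buffer names are made up, the unaligned/page-cross prologue is omitted (the input is assumed 32-byte aligned and zero padded), and it needs AVX-512BW/VL plus BMI1/BMI2 (e.g. gcc -O2 -mavx512bw -mavx512vl -mbmi -mbmi2).

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

/* Scan an aligned, zero-padded buffer 32 bytes at a time.  k0 flags CHAR
   matches, k1 flags null bytes; (k0 | k1) == 0 models kortestd+jz, and
   bzhi keeps only the null bits below the first CHAR match.  */
static const char *
strchr_two_kmask (const char *s, int c)
{
  __m256i vc = _mm256_set1_epi8 ((char) c);
  for (;; s += 32)
    {
      __m256i v = _mm256_load_si256 ((const __m256i *) s);
      uint32_t k0 = _mm256_cmpeq_epi8_mask (v, vc);                       /* CHAR hits */
      uint32_t k1 = _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());  /* '\0' hits */
      if ((k0 | k1) == 0)
        continue;                  /* nothing in this vector, keep scanning */
      unsigned int first_char = _tzcnt_u32 (k0);
      /* Nonzero means a null byte precedes the first CHAR match, so plain
         strchr must return NULL.  (tzcnt of 0 is 32, and bzhi with an
         index of 32 leaves k1 unchanged, so "no CHAR at all" works too.)  */
      if (_bzhi_u32 (k1, first_char) != 0)
        return NULL;
      return s + first_char;
    }
}

int
main (void)
{
  static const char buf[64] __attribute__ ((aligned (32))) =
    "the character X is in this string";
  printf ("%s\n", strchr_two_kmask (buf, 'X'));          /* "X is in this string" */
  printf ("%d\n", strchr_two_kmask (buf, 'Q') == NULL);  /* 1: hit '\0' first */
  return 0;
}

Compared with the cmp-against-memory check kept on the other vectors, this variant decides CHAR-versus-null entirely out of mask registers; per the comment added at L(cross_page_continue), the patch alternates the two styles to balance latency and port contention.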
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S index ddc86a7058..7cd111e96c 100644 --- a/sysdeps/x86_64/multiarch/strchr-evex.S +++ b/sysdeps/x86_64/multiarch/strchr-evex.S @@ -24,23 +24,26 @@ # define STRCHR __strchr_evex # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 # ifdef USE_AS_WCSCHR # define VPBROADCAST vpbroadcastd # define VPCMP vpcmpd # define VPMINU vpminud # define CHAR_REG esi -# define SHIFT_REG r8d +# define SHIFT_REG ecx +# define CHAR_SIZE 4 # else # define VPBROADCAST vpbroadcastb # define VPCMP vpcmpb # define VPMINU vpminub # define CHAR_REG sil -# define SHIFT_REG ecx +# define SHIFT_REG edx +# define CHAR_SIZE 1 # endif + # define XMMZERO xmm16 # define YMMZERO ymm16 @@ -56,23 +59,20 @@ # define VEC_SIZE 32 # define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section .text.evex,"ax",@progbits ENTRY (STRCHR) - movl %edi, %ecx -# ifndef USE_AS_STRCHRNUL - xorl %edx, %edx -# endif - /* Broadcast CHAR to YMM0. */ - VPBROADCAST %esi, %YMM0 - + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Check if we cross page boundary with one vector load. */ - andl $(PAGE_SIZE - 1), %ecx - cmpl $(PAGE_SIZE - VEC_SIZE), %ecx - ja L(cross_page_boundary) + /* Check if we cross page boundary with one vector load. Otherwise + it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Search for both CHAR and the null bytes. */ @@ -83,251 +83,291 @@ ENTRY (STRCHR) VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ VPCMP $0, %YMMZERO, %YMM2, %k0 - ktestd %k0, %k0 - jz L(more_vecs) kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) tzcntl %eax, %eax - /* Found CHAR or the null byte. */ # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + leaq (%rdi, %rax, CHAR_SIZE), %rax # else addq %rdi, %rax # endif # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (%rax), %CHAR_REG + jne L(zero) # endif ret - .p2align 4 -L(more_vecs): - /* Align data for aligned loads in the loop. */ - andq $-VEC_SIZE, %rdi -L(aligned_more): - - /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ - VMOVA VEC_SIZE(%rdi), %YMM1 - addq $VEC_SIZE, %rdi - - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x0) - - VMOVA VEC_SIZE(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x1) - - VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(first_vec_x2) - - VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. 
*/ - vpxorq %YMM1, %YMM0, %YMM2 - VPMINU %YMM2, %YMM1, %YMM2 - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM2, %k0 - ktestd %k0, %k0 - jz L(prep_loop_4x) - - kmovd %k0, %eax + /* .p2align 5 helps keep performance more consistent if ENTRY() + alignment % 32 was either 16 or 0. As well this makes the + alignment % 32 of the loop_4x_vec fixed which makes tuning it + easier. */ + .p2align 5 +L(first_vec_x3): tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax -# else - leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax -# endif +L(zero): + xorl %eax, %eax ret +# endif .p2align 4 -L(first_vec_x0): +L(first_vec_x4): +# ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ + kmovd %k0, %eax tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + kmovd %k1, %ecx + /* bzhil will not be 0 if first match was null. */ + bzhil %eax, %ecx, %ecx + jne L(zero) # else - addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Combine CHAR and null matches. */ + kord %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ret .p2align 4 L(first_vec_x1): tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq VEC_SIZE(%rdi, %rax, 4), %rax -# else - leaq VEC_SIZE(%rdi, %rax), %rax -# endif # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero) + # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret .p2align 4 L(first_vec_x2): +# ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ + kmovd %k0, %eax tzcntl %eax, %eax - /* Found CHAR or the null byte. */ -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax + kmovd %k1, %ecx + /* bzhil will not be 0 if first match was null. */ + bzhil %eax, %ecx, %ecx + jne L(zero) # else - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Combine CHAR and null matches. */ + kord %k0, %k1, %k0 + kmovd %k0, %eax + tzcntl %eax, %eax # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret -L(prep_loop_4x): - /* Align data to 4 * VEC_SIZE. */ + .p2align 4 +L(aligned_more): + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi +L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since + data is only aligned to VEC_SIZE. 
Use two alternating methods for + checking VEC to balance latency and port contention. */ + + /* This method has higher latency but has better port + distribution. */ + VMOVA (VEC_SIZE)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + /* This method has higher latency but has better port + distribution. */ + VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMMZERO, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMMZERO, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + + /* Align data to VEC_SIZE * 4 for the loop. */ + addq $VEC_SIZE, %rdi andq $-(VEC_SIZE * 4), %rdi .p2align 4 L(loop_4x_vec): - /* Compare 4 * VEC at a time forward. */ + /* Check 4x VEC at a time. No penalty to imm32 offset with evex + encoding. */ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 - /* Leaves only CHARS matching esi as 0. */ + /* For YMM1 and YMM3 use xor to set the CHARs matching esi to zero. */ vpxorq %YMM1, %YMM0, %YMM5 - vpxorq %YMM2, %YMM0, %YMM6 + /* For YMM2 and YMM4 cmp not equals to CHAR and store result in k + register. Its possible to save either 1 or 2 instructions using cmp no + equals method for either YMM1 or YMM1 and YMM3 respectively but + bottleneck on p5 makes it no worth it. */ + VPCMP $4, %YMM0, %YMM2, %k2 vpxorq %YMM3, %YMM0, %YMM7 - vpxorq %YMM4, %YMM0, %YMM8 - - VPMINU %YMM5, %YMM1, %YMM5 - VPMINU %YMM6, %YMM2, %YMM6 - VPMINU %YMM7, %YMM3, %YMM7 - VPMINU %YMM8, %YMM4, %YMM8 - - VPMINU %YMM5, %YMM6, %YMM1 - VPMINU %YMM7, %YMM8, %YMM2 - - VPMINU %YMM1, %YMM2, %YMM1 - - /* Each bit in K0 represents a CHAR or a null byte. */ - VPCMP $0, %YMMZERO, %YMM1, %k0 - - addq $(VEC_SIZE * 4), %rdi - - ktestd %k0, %k0 + VPCMP $4, %YMM0, %YMM4, %k4 + + /* Use min to select all zeros (either from xor or end of string). */ + VPMINU %YMM1, %YMM5, %YMM1 + VPMINU %YMM3, %YMM7, %YMM3 + + /* Use min + zeromask to select for zeros. Since k2 and k4 will be + have 0 as positions that matched with CHAR which will set zero in + the corresponding destination bytes in YMM2 / YMM4. */ + VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + + VPCMP $0, %YMMZERO, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx jz L(loop_4x_vec) - /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ - VPCMP $0, %YMMZERO, %YMM5, %k0 + VPCMP $0, %YMMZERO, %YMM1, %k0 kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x0) + jnz L(last_vec_x1) - /* Each bit in K1 represents a CHAR or a null byte in YMM2. 
*/ - VPCMP $0, %YMMZERO, %YMM6, %k1 - kmovd %k1, %eax + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax testl %eax, %eax - jnz L(first_vec_x1) - - /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ - VPCMP $0, %YMMZERO, %YMM7, %k2 - /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ - VPCMP $0, %YMMZERO, %YMM8, %k3 + jnz L(last_vec_x2) + VPCMP $0, %YMMZERO, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ # ifdef USE_AS_WCSCHR - /* NB: Each bit in K2/K3 represents 4-byte element. */ - kshiftlw $8, %k3, %k1 + sall $8, %ecx + orl %ecx, %eax + tzcntl %eax, %eax # else - kshiftlq $32, %k3, %k1 + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax # endif +# ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ + cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) +# endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret - /* Each bit in K1 represents a NULL or a mismatch. */ - korq %k1, %k2, %k1 - kmovq %k1, %rax +# ifndef USE_AS_STRCHRNUL +L(zero_end): + xorl %eax, %eax + ret +# endif - tzcntq %rax, %rax -# ifdef USE_AS_WCSCHR - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax -# else - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + .p2align 4 +L(last_vec_x1): + tzcntl %eax, %eax +# ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +L(last_vec_x2): + tzcntl %eax, %eax # ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) # endif + /* NB: Multiply sizeof char type (1 or 4) to get the number of + bytes. */ + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret /* Cold case for crossing page with first load. */ .p2align 4 L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ andq $-VEC_SIZE, %rdi - andl $(VEC_SIZE - 1), %ecx - VMOVA (%rdi), %YMM1 - /* Leaves only CHARS matching esi as 0. */ vpxorq %YMM1, %YMM0, %YMM2 VPMINU %YMM2, %YMM1, %YMM2 /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ VPCMP $0, %YMMZERO, %YMM2, %k0 kmovd %k0, %eax - testl %eax, %eax - + /* Remove the leading bits. */ # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG /* NB: Divide shift count by 4 since each bit in K1 represent 4 bytes. */ - movl %ecx, %SHIFT_REG - sarl $2, %SHIFT_REG + sarl $2, %SHIFT_REG + andl $(CHAR_PER_VEC - 1), %SHIFT_REG # endif - - /* Remove the leading bits. */ sarxl %SHIFT_REG, %eax, %eax + /* If eax is zero continue. */ testl %eax, %eax - - jz L(aligned_more) + jz L(cross_page_continue) tzcntl %eax, %eax - addq %rcx, %rdi +# ifndef USE_AS_STRCHRNUL + /* Check to see if match was CHAR or null. */ + cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG + jne L(zero_end) +# endif # ifdef USE_AS_WCSCHR /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ - leaq (%rdi, %rax, 4), %rax + leaq (%rdx, %rax, CHAR_SIZE), %rax # else - addq %rdi, %rax -# endif -# ifndef USE_AS_STRCHRNUL - cmp (%rax), %CHAR_REG - cmovne %rdx, %rax + addq %rdx, %rax # endif ret
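To make the 4x-loop change easier to follow outside of assembly, here is a compile-only C sketch of the same funnelling (illustrative names; assumes the 128-byte chunk is aligned and readable; needs AVX-512BW/VL, e.g. -mavx512bw -mavx512vl). The 1st and 3rd vectors are reduced with xor+min, the 2nd and 4th with a not-equal compare whose kmask drives a zero-masked min, so one compare of the accumulator against zero covers all 128 bytes.

#include <immintrin.h>
#include <stdint.h>

/* Returns a nonzero mask iff any byte of the four vectors at P is CHAR
   (broadcast in VC) or the null terminator.  Mirrors L(loop_4x_vec).  */
static uint32_t
loop_4x_hit (const __m256i *p, __m256i vc)
{
  __m256i v1 = _mm256_load_si256 (p + 0);
  __m256i v2 = _mm256_load_si256 (p + 1);
  __m256i v3 = _mm256_load_si256 (p + 2);
  __m256i v4 = _mm256_load_si256 (p + 3);

  /* v1/v3: xor zeroes the CHAR bytes, min with the original keeps nulls.  */
  __m256i m1 = _mm256_min_epu8 (v1, _mm256_xor_si256 (v1, vc));
  __m256i m3 = _mm256_min_epu8 (v3, _mm256_xor_si256 (v3, vc));

  /* v2/v4: "not equal to CHAR" masks (VPCMP $4).  A zero-masked min then
     forces the CHAR positions to zero while folding in the other vectors.  */
  __mmask32 k2 = _mm256_cmpneq_epi8_mask (v2, vc);
  __mmask32 k4 = _mm256_cmpneq_epi8_mask (v4, vc);

  __m256i m2  = _mm256_maskz_min_epu8 (k2, m1, v2);   /* VPMINU ...%YMM2{%k2}{z} */
  __m256i m34 = _mm256_min_epu8 (m3, v4);
  __m256i acc = _mm256_maskz_min_epu8 (k4, m2, m34);  /* VPMINU ...%YMM4{%k4}{z} */

  /* One zero-compare per 128 bytes; nonzero means CHAR or '\0' somewhere.  */
  return (uint32_t) _mm256_cmpeq_epi8_mask (acc, _mm256_setzero_si256 ());
}

When the returned mask is nonzero, the assembly leaves the loop and re-derives per-vector masks from YMM1 through YMM3, combining the saved mask for the fourth vector to locate the exact match; the accumulator itself only answers whether the 4x block contains a hit. This restructuring is what the commit message refers to as saving instructions in the 4x loop.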
No bug. This commit optimizes strchr-evex.S. The optimizations are mostly small things such as saving an ALU in the alignment process and saving a few instructions in the loop return. The one significant change is saving 2 instructions in the 4x loop. test-strchr, test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 388 ++++++++++++++-----------
 1 file changed, 214 insertions(+), 174 deletions(-)
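For completeness, the "alignment process" the commit message mentions is the entry path: a single unaligned vector load is used unless it would cross into the next page, and the cold page-cross path does an aligned load, shifts the match mask right by the misalignment, and indexes from the saved original pointer instead of re-adjusting rdi. Below is a hedged C sketch of that cold path with made-up names, strchrnul-style behaviour (it returns a pointer to CHAR or to the terminator rather than NULL), and the same AVX-512BW/VL assumption as above.

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Entry decision: an unaligned 32-byte load is safe unless fewer than
   VEC_SIZE bytes remain in the current page.  */
static inline int
crosses_page (const char *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* Cold path: load the aligned vector containing S, build the CHAR-or-null
   bitmask, then shift out the bits for bytes before S.  Returns the first
   CHAR or '\0' at or after S, or NULL when this head vector has no hit and
   the caller should continue on the aligned path, as the assembly does at
   L(cross_page_continue).  */
static const char *
cross_page_head (const char *s, int c)
{
  const char *aligned = (const char *) ((uintptr_t) s & -(uintptr_t) VEC_SIZE);
  __m256i v  = _mm256_load_si256 ((const __m256i *) aligned);
  __m256i vc = _mm256_set1_epi8 ((char) c);
  __m256i m  = _mm256_min_epu8 (v, _mm256_xor_si256 (v, vc)); /* 0 at CHAR or '\0' */
  uint32_t k = _mm256_cmpeq_epi8_mask (m, _mm256_setzero_si256 ());
  k >>= (uintptr_t) s & (VEC_SIZE - 1);   /* sarx: drop bits for bytes before S */
  return k ? s + _tzcnt_u32 (k) : NULL;
}

In the byte variant, sarx masks its shift count to 5 bits on its own, which is why the new code can feed the unmasked low bits of the original pointer straight in and drop both the separate andl and the addq of the shift count back onto rdi; the sketch models that by shifting by s modulo VEC_SIZE directly.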