Message ID | 20221019004409.3623395-2-goldstein.w.n@gmail.com
State      | New
Series     | [v3,1/7] x86: Optimize memchr-evex.S and implement with VMM headers
On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Size Optimizations:
> 1. Condense the hot path for better cache locality.
>     - This is most impactful for strchrnul, where the logic for strings
>       with len <= VEC_SIZE or with a match in the first VEC now fits
>       entirely in the first cache line.
> 2. Reuse common targets in the first 4x VEC and after the loop.
> 3. Don't align targets so aggressively if it doesn't change the number
>    of fetch blocks required, and take more care to avoid cases where
>    targets unnecessarily split cache lines.
> 4. Align the loop better for the DSB/LSD.
> 5. Use more code-size-efficient instructions:
>     - tzcnt ...     -> bsf ...
>     - vpcmpb $0 ... -> vpcmpeq ...
> 6. Align labels less aggressively, especially if doing so doesn't save
>    fetch blocks or causes the basic block to span extra cache lines.
>
> Code Size Changes:
> strchr-evex.S   : -63 bytes
> strchrnul-evex.S: -48 bytes
>
> Net perf changes:
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
> improvement and > 1.0 is a regression.
>
> strchr-evex.S (Fixed) : 0.971
> strchr-evex.S (Rand)  : 0.932
> strchrnul-evex.S      : 0.965
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
>  1 file changed, 340 insertions(+), 218 deletions(-)

LGTM. Thanks.
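
The trick both the old and new code paths build on (vpxorq + VPMINU +
VPTESTN) is easier to follow in scalar form. Below is a minimal C model
of the per-lane test; the names (lane_hits, the driver) are invented for
this sketch and are not part of the patch:

  #include <stdio.h>

  /* Scalar model of one vector lane: vpxorq zeroes lanes equal to CHAR,
     VPMINU merges those zeros with the string's own null bytes, and
     VPTESTN then sets a mask bit for every zero lane.  */
  static int
  lane_hits (unsigned char c, unsigned char target)
  {
    unsigned char x = c ^ target;     /* 0 iff c == target (vpxorq).  */
    unsigned char m = x < c ? x : c;  /* 0 iff c == target or c == 0 (VPMINU).  */
    return m == 0;                    /* VPTESTN flags the zero lanes.  */
  }

  int
  main (void)
  {
    const char *s = "abcdef";
    /* First lane that is either 'd' or the null terminator: index 3.  */
    for (int i = 0;; i++)
      if (lane_hits ((unsigned char) s[i], 'd'))
        {
          printf ("hit at index %d\n", i);
          break;
        }
    return 0;
  }

min (c ^ target, c) is zero exactly when c == target (the xor is zero) or
c == '\0' (the string byte itself is zero), so a single unsigned min folds
the CHAR test and the null-terminator test into one zero-test that VPTESTN
performs across the whole vector in a mask register.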
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index a1c15c4419..c2a0d112f7 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -26,48 +26,75 @@
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP	vpcmpd
+#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	rcx
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP	vpcmpb
+#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	edx
+#  define SHIFT_REG	rdi
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
-	/* Broadcast CHAR to YMM0.  */
-	VPBROADCAST	%esi, %YMM0
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define MASK_GPR	rcx
+#  define LOOP_REG	rax
+
+#  define COND_MASK(k_reg)	{%k_reg}
+# else
+#  define MASK_GPR	rax
+#  define LOOP_REG	rdi
+
+#  define COND_MASK(k_reg)
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+#  define TESTZ(reg)	incq %VGPR_SZ(reg, 64)
+# else
+
+#  if CHAR_PER_VEC == 32
+#   define TESTZ(reg)	incl %VGPR_SZ(reg, 32)
+#  elif CHAR_PER_VEC == 16
+#   define TESTZ(reg)	incw %VGPR_SZ(reg, 16)
+#  else
+#   define TESTZ(reg)	incb %VGPR_SZ(reg, 8)
+#  endif
+
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+
+# define VMATCH	VMM(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+	/* Broadcast CHAR to VEC_0.  */
+	VPBROADCAST	%esi, %VMATCH
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
+
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
-	VMOVU	(%rdi), %YMM1
-
+	VMOVU	(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test condition so
+	   that all logic for match/null in first VEC first in 1x cache
+	   lines. This has a slight cost to larger sizes.  */
+	bsf	%VRAX, %VRAX
+	jz	L(aligned_more)
+# else
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
+# endif
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
 # endif
 	ret
 
-
-
-	.p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
-	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
-	jne	L(zero)
-# else
-	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-
 # ifndef USE_AS_STRCHRNUL
 L(zero):
 	xorl	%eax, %eax
 	ret
 # endif
 
-
-	.p2align 4
+	.p2align 4,, 2
+L(first_vec_x3):
+	subq	$-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+	/* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+	   For VEC_SIZE == 64 the registers don't match.  */
+L(last_vec_x2):
+# endif
 L(first_vec_x1):
 	/* Use bsf here to save 1-byte keeping keeping the block in 1x
 	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
+	bsf	%VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* Found CHAR or the null byte.  */
+	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero)
-
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 2
+L(first_vec_x4):
+	subq	$-(VEC_SIZE * 2), %rdi
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
+	KMOV	%k0, %VRAX
+	tzcnt	%VRAX, %VRAX
+	KMOV	%k1, %VRCX
 	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
+	bzhi	%VRAX, %VRCX, %VRCX
 	jne	L(zero)
 # else
 	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+	KOR	%k0, %k1, %k0
+	KMOV	%k0, %VRAX
+	bsf	%VRAX, %VRAX
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
-L(first_vec_x3):
-	/* Use bsf here to save 1-byte keeping keeping the block in 1x
-	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
+# ifdef USE_AS_STRCHRNUL
+	/* We use this as a hook to get imm8 encoding for the jmp to
+	   L(page_cross_boundary). This allows the hot case of a
+	   match/null-term in first VEC to fit entirely in 1 cache
+	   line.  */
+L(cross_page_boundary):
+	jmp	L(cross_page_boundary_real)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
 	.p2align 4
 L(aligned_more):
+L(cross_page_continue):
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
-	   data is only aligned to VEC_SIZE. Use two alternating methods
-	   for checking VEC to balance latency and port contention.  */
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE. Use two alternating
+	   methods for checking VEC to balance latency and port
+	   contention.  */
+
+	/* Method(1) with 8c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.83, p1 * 0.83, p5 * 1.33
+	   For VEC_SIZE == 64:
+	   p0 * 2.50, p1 * 0.00, p5 * 1.50  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	/* Method(2) with 6c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00
+	   For VEC_SIZE == 64:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST	%k0, %k1
 	jnz	L(first_vec_x2)
 
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* By swapping between Method 1/2 we get more fair port
+	   distrubition and better throughput.  */
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST	%k0, %k1
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 for the loop.  */
+# if VEC_SIZE == 64
+	/* Use rax for the loop reg as it allows to the loop to fit in
+	   exactly 2-cache-lines. (more efficient imm32 + gpr
+	   encoding).  */
+	leaq	(VEC_SIZE)(%rdi), %rax
+	/* No partial register stalls on evex512 processors.  */
+	xorb	%al, %al
+# else
+	/* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+	   reuse more code and save space.  */
 	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
-
+# endif
 	.p2align 4
 L(loop_4x_vec):
-	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
-	   encoding.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
-
-	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	/* Check 4x VEC at a time. No penalty for imm32 offset with evex
+	   encoding (if offset % VEC_SIZE == 0).  */
+	VMOVA	(VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+	/* Collect bits where VEC_1 does NOT match esi. This is later
+	   use to mask of results (getting not matches allows us to
+	   save an instruction on combining).  */
+	VPCMP	$4, %VMATCH, %VMM(1), %k1
+
+	/* Two methods for loop depending on VEC_SIZE. This is because
+	   with zmm registers VPMINU can only run on p0 (as opposed to
+	   p0/p1 for ymm) so it is less prefered.  */
+# if VEC_SIZE == 32
+	/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 	   zero.  */
-	vpxorq	%YMM1, %YMM0, %YMM5
-	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
-	   k register. Its possible to save either 1 or 2 instructions
-	   using cmp no equals method for either YMM1 or YMM1 and YMM3
-	   respectively but bottleneck on p5 makes it not worth it.  */
-	VPCMP	$4, %YMM0, %YMM2, %k2
-	vpxorq	%YMM3, %YMM0, %YMM7
-	VPCMP	$4, %YMM0, %YMM4, %k4
-
-	/* Use min to select all zeros from either xor or end of string).
-	 */
-	VPMINU	%YMM1, %YMM5, %YMM1
-	VPMINU	%YMM3, %YMM7, %YMM3
+	vpxorq	%VMM(2), %VMATCH, %VMM(6)
+	vpxorq	%VMM(3), %VMATCH, %VMM(7)
 
-	/* Use min + zeromask to select for zeros. Since k2 and k4 will
-	   have 0 as positions that matched with CHAR which will set
-	   zero in the corresponding destination bytes in YMM2 / YMM4.
-	 */
-	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
-	VPMINU	%YMM3, %YMM4, %YMM4
-	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
-
-	VPTESTN	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	subq	$-(VEC_SIZE * 4), %rdi
-	testl	%ecx, %ecx
+	/* Find non-matches in VEC_4 while combining with non-matches
+	   from VEC_1. NB: Try and use masked predicate execution on
+	   instructions that have mask result as it has no latency
+	   penalty.  */
+	VPCMP	$4, %VMATCH, %VMM(4), %k4{%k1}
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Use min to select all zeros from either xor or end of
+	   string).  */
+	VPMINU	%VMM(3), %VMM(7), %VMM(3)
+	VPMINU	%VMM(2), %VMM(6), %VMM(2)
+
+	/* Combined zeros from VEC_2 / VEC_3 (search for null term).  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combined zeros from VEC_2 / VEC_4 (this has all null term and
+	   esi matches for VEC_2 / VEC_3).  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4)
+# else
+	/* Collect non-matches for VEC_2.  */
+	VPCMP	$4, %VMM(2), %VMATCH, %k2
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Find non-matches in VEC_3/VEC_4 while combining with non-
+	   matches from VEC_1/VEC_2 respectively.  */
+	VPCMP	$4, %VMM(3), %VMATCH, %k3{%k1}
+	VPCMP	$4, %VMM(4), %VMATCH, %k4{%k2}
+
+	/* Finish combining zeros in all VECs.  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combine in esi matches for VEC_3 (if there was a match with
+	   esi, the corresponding bit in %k3 is zero so the
+	   VPMINU_MASKZ will have a zero in the result). NB: This make
+	   the VPMINU 3c latency. The only way to avoid it is to
+	   createa a 12c dependency chain on all the `VPCMP $4, ...`
+	   which has higher total latency.  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+	VPTEST	%VMM(4), %VMM(4), %k0{%k4}
+	KMOV	%k0, %VRDX
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+
+	/* TESTZ is inc using the proper register width depending on
+	   CHAR_PER_VEC. An esi match or null-term match leaves a zero-
+	   bit in rdx so inc won't overflow and won't be zero.  */
+	TESTZ	(rdx)
 	jz	L(loop_4x_vec)
 
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VGPR(MASK_GPR)
+	TESTZ	(MASK_GPR)
+# if VEC_SIZE == 32
+	/* We can reuse the return code in page_cross logic for VEC_SIZE
+	   == 32.  */
+	jnz	L(last_vec_x1_vec_size32)
+# else
+	jnz	L(last_vec_x1_vec_size64)
+# endif
+
 
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* COND_MASK integates the esi matches for VEC_SIZE == 64. For
+	   VEC_SIZE == 32 they are already integrated.  */
+	VPTEST	%VMM(2), %VMM(2), %k0 COND_MASK(k2)
+	KMOV	%k0, %VRCX
+	TESTZ	(rcx)
 	jnz	L(last_vec_x2)
 
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %eax
-	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
-# ifdef USE_AS_WCSCHR
-	sall	$8, %ecx
-	orl	%ecx, %eax
-	bsfl	%eax, %eax
+	VPTEST	%VMM(3), %VMM(3), %k0 COND_MASK(k3)
+	KMOV	%k0, %VRCX
+# if CHAR_PER_VEC == 64
+	TESTZ	(rcx)
+	jnz	L(last_vec_x3)
 # else
-	salq	$32, %rcx
-	orq	%rcx, %rax
-	bsfq	%rax, %rax
+	salq	$CHAR_PER_VEC, %rdx
+	TESTZ	(rcx)
+	orq	%rcx, %rdx
 # endif
+
+	bsfq	%rdx, %rdx
+
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
-	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 8
-L(last_vec_x1):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
-	 */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
 # endif
 
-# ifndef USE_AS_STRCHRNUL
+
+	/* Seperate return label for last VEC1 because for VEC_SIZE ==
+	   32 we can reuse return code in L(page_cross) but VEC_SIZE ==
+	   64 has mismatched registers.  */
+# if VEC_SIZE == 64
+	.p2align 4,, 8
+L(last_vec_x1_vec_size64):
+	bsf	%VRCX, %VRCX
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rax), %CHAR_REG
+	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
-
+#  endif
+#  ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rax, %rcx, CHAR_SIZE), %rax
+#  else
+	addq	%rcx, %rax
+#  endif
 	ret
 
+	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
+	   == 64 we need return label for last VEC3.  */
+#  if CHAR_PER_VEC == 64
 	.p2align 4,, 8
+L(last_vec_x3):
+	addq	$VEC_SIZE, %LOOP_REG
+#  endif
+
+	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+	   reuse L(first_vec_x3) due to register mismatch.  */
 L(last_vec_x2):
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
+	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
+#  endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
 	ret
+# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4,, 8
+	/* Cold case for crossing page with first load.  */
+	.p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+# endif
+L(cross_page_boundary_real):
 	/* Align rdi.  */
-	andq	$-VEC_SIZE, %rdi
-	VMOVA	(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	/* Use high latency method of getting matches to save code size.
+	 */
+
+	/* K1 has 1s where VEC(1) does NOT match esi.  */
+	VPCMP	$4, %VMM(1), %VMATCH, %k1
+	/* K0 has ones where K1 is 1 (non-match with esi), and non-zero
+	   (null).  */
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
 	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
-	movl	%edx, %SHIFT_REG
+	movl	%edi, %VGPR_SZ(SHIFT_REG, 32)
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	sarl	$2, %SHIFT_REG
-	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+	sarl	$2, %VGPR_SZ(SHIFT_REG, 32)
+	andl	$(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+	/* if wcsrchr we need to reverse matches as we can't rely on
+	   signed shift to bring in ones. There is not sarx for
+	   gpr8/16. Also not we can't use inc here as the lower bits
+	   represent matches out of range so we can't rely on overflow.
+	 */
+	xorl	$((1 << CHAR_PER_VEC)- 1), %eax
+# endif
+	/* Use arithmatic shift so that leading 1s are filled in.  */
+	sarx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+	/* If eax is all ones then no matches for esi or NULL.  */
+
+# ifdef USE_AS_WCSCHR
+	test	%VRAX, %VRAX
+# else
+	inc	%VRAX
 # endif
-	sarxl	%SHIFT_REG, %eax, %eax
-	/* If eax is zero continue.  */
-	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	bsfl	%eax, %eax
 
+	.p2align 4,, 10
+L(last_vec_x1_vec_size32):
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of
-	   bytes.  */
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdx, %rax
+	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if match was CHAR or null.  */
 	cmp	(%rax), %CHAR_REG
-	je	L(cross_page_ret)
-L(zero_end):
-	xorl	%eax, %eax
-L(cross_page_ret):
+	jne	L(zero_end_0)
 # endif
 	ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+	xorl	%eax, %eax
+	ret
+# endif
 
 END (STRCHR)
 #endif
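
The same source file builds strchr by default, strchrnul with
USE_AS_STRCHRNUL, and the wide-char variants with USE_AS_WCSCHR; the
entry points differ only in their not-found result, which is what the
L(zero)* paths above implement. A small usage demo of that difference,
assuming glibc (strchrnul is a GNU extension, so _GNU_SOURCE is needed):

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>

  int
  main (void)
  {
    const char *s = "hello";

    /* strchr returns NULL when CHAR is not found.  */
    printf ("strchr:    %p\n", (void *) strchr (s, 'z'));

    /* strchrnul instead returns a pointer to the null terminator, so
       the result is always a valid pointer into the string.  */
    printf ("strchrnul: s + %td\n", strchrnul (s, 'z') - s);

    /* When CHAR is present both behave the same.  */
    printf ("found:     \"%s\"\n", strchr (s, 'l'));
    return 0;
  }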