Message ID | 20221019004409.3623395-3-goldstein.w.n@gmail.com
---|---
State | New |
Series | [v3,1/7] x86: Optimize memchr-evex.S and implement with VMM headers
On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that bsf(0) leaves the destination unchanged to save a
>    branch in short string case.
> 2. Restructure code so that small strings are given the hot path.
>    - This is a net-zero on the benchmark suite but in general makes
>      sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>    - tzcnt ...     -> bsf ...
>    - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the strnlen and
> strlen code essentially incompatible so split strnlen-evex
> to a new file.
>
> Code Size Changes:
> strlen-evex.S  : -23 bytes
> strnlen-evex.S : -167 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time so < 1.0 is
> improvement and 1.0 is regression.
>
> strlen-evex.S  : 0.992 (No real change)
> strnlen-evex.S : 0.947
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
>  sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
>  3 files changed, 572 insertions(+), 404 deletions(-)

LGTM. Thanks.
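To make point 1 concrete, below is a minimal, hypothetical GAS sketch of the short-string entry (the function name, register roles and the 64-char CHAR_PER_VEC value are illustrative assumptions, not taken from the patch; the real code is in strnlen-evex.S in the diff below). Preloading the maximum length into the result register lets bsf, which leaves its destination unchanged when the source is zero, stand in for a tzcnt plus an explicit "no null found" test-and-branch:

	.text
	.globl	strnlen_entry_sketch
	.type	strnlen_entry_sketch, @function
	/* Assumed inputs: %rcx = bitmask with one bit set per null CHAR
	   found in the first 64-byte vector, %rsi = maxlen.  Returns the
	   strnlen result when the answer lies in the first vector,
	   otherwise branches to the multi-vector path.  */
strnlen_entry_sketch:
	movq	%rsi, %rax		/* Default result = maxlen.  */
	bsfq	%rcx, %rax		/* Unchanged if %rcx == 0 (no null),
					   so no extra test/jz is needed.  */
	cmpq	$64, %rax		/* 64 == CHAR_PER_VEC in this sketch.  */
	ja	.Lmore_1x_vec		/* No null and maxlen > one vector.  */
	cmpq	%rax, %rsi		/* Clamp to maxlen if the first null
					   lies past the length limit.  */
	cmovb	%rsi, %rax
	ret
.Lmore_1x_vec:
	/* The real implementation continues with the aligned 4x-vector
	   loop here; with tzcnt instead of bsf the "no null" case would
	   already have cost a separate compare and branch above.  */
	ret
	.size	strnlen_entry_sketch, .-strnlen_entry_sketch

Point 3 is purely an encoding-size argument; as an illustration (byte counts come from the x86-64 instruction encodings, not from the patch):

	tzcntl	%eax, %eax		/* f3 0f bc c0   -- 4 bytes.  */
	bsfl	%eax, %eax		/* 0f bc c0      -- 3 bytes.  */
	vpcmpb	$0, (%rdi), %ymm16, %k0	/* EVEX + imm8   -- 7 bytes.  */
	vpcmpeqb (%rdi), %ymm16, %k0	/* EVEX, no imm8 -- 6 bytes.  */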
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 2109ec2f7a..487846f098 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -26,466 +26,220 @@ # define STRLEN __strlen_evex # endif -# define VMOVA vmovdqa64 +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif # ifdef USE_AS_WCSLEN -# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd +# define VPCMPNEQ vpcmpneqd +# define VPTESTN vptestnmd +# define VPTEST vptestmd # define VPMINU vpminud -# define SHIFT_REG ecx # define CHAR_SIZE 4 +# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg # else -# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define VPCMPNEQ vpcmpneqb +# define VPTESTN vptestnmb +# define VPTEST vptestmb # define VPMINU vpminub -# define SHIFT_REG edx # define CHAR_SIZE 1 +# define CHAR_SIZE_SHIFT_REG(reg) + +# define REG_WIDTH VEC_SIZE # endif -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 - -# define VEC_SIZE 32 -# define PAGE_SIZE 4096 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section .text.evex,"ax",@progbits -ENTRY (STRLEN) -# ifdef USE_AS_STRNLEN - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(zero) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif - mov %RSI_LP, %R8_LP +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# include "reg-macros.h" + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) # endif + +# define XZERO VMM_128(0) +# define VZERO VMM(0) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (STRLEN, 6) movl %edi, %eax - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Clear high bits from edi. Only keeping bits relevant to page - cross check. */ + vpxorq %XZERO, %XZERO, %XZERO andl $(PAGE_SIZE - 1), %eax - /* Check if we may cross page boundary with one vector load. */ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null byte. */ - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax -# ifdef USE_AS_STRNLEN - /* If length < CHAR_PER_VEC handle special. */ - cmpq $CHAR_PER_VEC, %rsi - jbe L(first_vec_x0) -# endif - testl %eax, %eax + VPCMPEQ (%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jz L(aligned_more) - tzcntl %eax, %eax - ret -# ifdef USE_AS_STRNLEN -L(zero): - xorl %eax, %eax - ret - - .p2align 4 -L(first_vec_x0): - /* Set bit for max len so that tzcnt will return min of max len - and position of first match. */ - btsq %rsi, %rax - tzcntl %eax, %eax - ret -# endif - - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ - sarl $2, %edi -# endif - leal CHAR_PER_VEC(%rdi, %rax), %eax -# endif - ret - - .p2align 4 -L(first_vec_x2): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif - leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax -# endif + bsf %VRAX, %VRAX ret - .p2align 4 -L(first_vec_x3): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif - leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax -# endif - ret - - .p2align 4 + .p2align 4,, 8 L(first_vec_x4): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif + bsf %VRAX, %VRAX + subl %ecx, %edi + CHAR_SIZE_SHIFT_REG (edi) leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax -# endif ret - .p2align 5 + + + /* Aligned more for strnlen compares remaining length vs 2 * + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before + going to the loop. */ + .p2align 4,, 10 L(aligned_more): - movq %rdi, %rdx - /* Align data to VEC_SIZE. */ - andq $-(VEC_SIZE), %rdi + movq %rdi, %rcx + andq $(VEC_SIZE * -1), %rdi L(cross_page_continue): - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ -# ifdef USE_AS_STRNLEN - /* + CHAR_SIZE because it simplies the logic in - last_4x_vec_or_less. */ - leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx - subq %rdx, %rcx -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %ecx -# endif -# endif - /* Load first VEC regardless. */ - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 -# ifdef USE_AS_STRNLEN - /* Adjust length. If near end handle specially. */ - subq %rcx, %rsi - jb L(last_4x_vec_or_less) -# endif - kmovd %k0, %eax - testl %eax, %eax + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x1) - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - test %eax, %eax + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x2) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x3) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x4) - addq $VEC_SIZE, %rdi -# ifdef USE_AS_STRNLEN - /* Check if at last VEC_SIZE * 4 length. 
*/ - cmpq $(CHAR_PER_VEC * 4 - 1), %rsi - jbe L(last_4x_vec_or_less_load) - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %ecx -# endif - /* Readjust length. */ - addq %rcx, %rsi -# endif - /* Align data to VEC_SIZE * 4. */ + subq $(VEC_SIZE * -1), %rdi + +# if CHAR_PER_VEC == 64 + /* No partial register stalls on processors that we use evex512 + on and this saves code size. */ + xorb %dil, %dil +# else andq $-(VEC_SIZE * 4), %rdi +# endif + + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Load first VEC regardless. */ - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 -# ifdef USE_AS_STRNLEN - /* Break if at end of length. */ - subq $(CHAR_PER_VEC * 4), %rsi - jb L(last_4x_vec_or_less_cmpeq) -# endif - /* Save some code size by microfusing VPMINU with the load. Since - the matches in ymm2/ymm4 can only be returned if there where no - matches in ymm1/ymm3 respectively there is no issue with overlap. - */ - VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 - VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 - VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k2 - VPCMP $0, %YMM2, %YMMZERO, %k0 - VPCMP $0, %YMM4, %YMMZERO, %k1 subq $-(VEC_SIZE * 4), %rdi - kortestd %k0, %k1 + KORTEST %k0, %k2 jz L(loop_4x_vec) - /* Check if end was in first half. */ - kmovd %k0, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - shrq $2, %rdi -# endif - testl %eax, %eax - jz L(second_vec_return) + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) - VPCMP $0, %YMM1, %YMMZERO, %k2 - kmovd %k2, %edx - /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ -# ifdef USE_AS_WCSLEN - sall $CHAR_PER_VEC, %eax - orl %edx, %eax - tzcntl %eax, %eax -# else - salq $CHAR_PER_VEC, %rax - orq %rdx, %rax - tzcntq %rax, %rax -# endif - addq %rdi, %rax - ret - - -# ifdef USE_AS_STRNLEN - -L(last_4x_vec_or_less_load): - /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 -L(last_4x_vec_or_less_cmpeq): - VPCMP $0, %YMM1, %YMMZERO, %k0 - addq $(VEC_SIZE * 3), %rdi -L(last_4x_vec_or_less): - kmovd %k0, %eax - /* If remaining length > VEC_SIZE * 2. This works if esi is off by - VEC_SIZE * 4. */ - testl $(CHAR_PER_VEC * 2), %esi - jnz L(last_4x_vec) - - /* length may have been negative or positive by an offset of - CHAR_PER_VEC * 4 depending on where this was called from. This - fixes that. */ - andl $(CHAR_PER_VEC * 4 - 1), %esi - testl %eax, %eax - jnz L(last_vec_x1_check) + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) - /* Check the end of data. */ - subl $CHAR_PER_VEC, %esi - jb L(max) + VPTESTN %VMM(3), %VMM(3), %k0 - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max) - - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret -L(max): - movq %r8, %rax - ret -# endif - - /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) - in the 4x VEC loop can use 2 byte encoding. */ - .p2align 4 -L(second_vec_return): - VPCMP $0, %YMM3, %YMMZERO, %k0 - /* Combine YMM3 matches (k0) with YMM4 matches (k1). 
*/ -# ifdef USE_AS_WCSLEN - kunpckbw %k0, %k1, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax +# if CHAR_PER_VEC == 64 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k2, %VRAX # else - kunpckdq %k0, %k1, %k0 - kmovq %k0, %rax - tzcntq %rax, %rax + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. + */ + kmovd %k2, %edx + kmovd %k0, %eax + salq $CHAR_PER_VEC, %rdx + orq %rdx, %rax # endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret - -# ifdef USE_AS_STRNLEN -L(last_vec_x1_check): - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max) - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. + */ + .p2align 4,, 2 +L(FALLTHROUGH_RETURN_LBL): + bsfq %rax, %rax + subq %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax ret - .p2align 4 -L(last_4x_vec): - /* Test first 2x VEC normally. */ - testl %eax, %eax - jnz L(last_vec_x1) - - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - /* Normalize length. */ - andl $(CHAR_PER_VEC * 4 - 1), %esi - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - /* Check the end of data. */ - subl $(CHAR_PER_VEC * 3), %esi - jb L(max) - - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max_end) - - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + .p2align 4,, 8 +L(first_vec_x0): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + addq %rdi, %rax ret - .p2align 4 -L(last_vec_x1): - tzcntl %eax, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif + .p2align 4,, 10 +L(first_vec_x1): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret - .p2align 4 -L(last_vec_x2): - tzcntl %eax, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret - - .p2align 4 -L(last_vec_x3): - tzcntl %eax, %eax - subl $(CHAR_PER_VEC * 2), %esi - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max_end) - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax - ret -L(max_end): - movq %r8, %rax + .p2align 4,, 10 + /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. + */ +L(TAIL_RETURN_LBL): + bsf %VRAX, %VRAX + sub %VRCX, %VRDI + CHAR_SIZE_SHIFT_REG (VRDI) + lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX ret -# endif - /* Cold case for crossing page with first load. */ - .p2align 4 + .p2align 4,, 8 L(cross_page_boundary): - movq %rdi, %rdx + movq %rdi, %rcx /* Align data to VEC_SIZE. */ andq $-VEC_SIZE, %rdi - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - /* Remove the leading bytes. */ + + VPCMPEQ (%rdi), %VZERO, %k0 + + KMOV %k0, %VRAX # ifdef USE_AS_WCSLEN - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. 
*/ - movl %edx, %ecx - shrl $2, %ecx - andl $(CHAR_PER_VEC - 1), %ecx -# endif - /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ - sarxl %SHIFT_REG, %eax, %eax + movl %ecx, %edx + shrl $2, %edx + andl $(CHAR_PER_VEC - 1), %edx + shrx %edx, %eax, %eax testl %eax, %eax -# ifndef USE_AS_STRNLEN - jz L(cross_page_continue) - tzcntl %eax, %eax - ret # else - jnz L(cross_page_less_vec) -# ifndef USE_AS_WCSLEN - movl %edx, %ecx - andl $(CHAR_PER_VEC - 1), %ecx -# endif - movl $CHAR_PER_VEC, %eax - subl %ecx, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - ja L(cross_page_continue) - movl %esi, %eax - ret -L(cross_page_less_vec): - tzcntl %eax, %eax - /* Select min of length and position of first null. */ - cmpq %rax, %rsi - cmovb %esi, %eax - ret + shr %cl, %VRAX # endif + jz L(cross_page_continue) + bsf %VRAX, %VRAX + ret END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S index 64a9fc2606..443a32a749 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -1,8 +1,423 @@ -#ifndef STRNLEN -# define STRNLEN __strnlen_evex -#endif +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <isa-level.h> +#include <sysdep.h> + +#if ISA_SHOULD_BUILD (4) + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + + +# ifndef STRNLEN +# define STRNLEN __strnlen_evex +# endif + +# ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPCMPNEQ vpcmpneqd +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPMINU vpminud +# define CHAR_SIZE 4 + +# else +# define VPCMPEQ vpcmpeqb +# define VPCMPNEQ vpcmpneqb +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPMINU vpminub +# define CHAR_SIZE 1 + +# define REG_WIDTH VEC_SIZE +# endif + +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# include "reg-macros.h" + +# if CHAR_PER_VEC == 32 +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) +# else +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) +# endif + + + +# if CHAR_PER_VEC == 64 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) +# else +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) +# endif + + +# define XZERO VMM_128(0) +# define VZERO VMM(0) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (STRNLEN, 6) + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + + movl %edi, %eax + vpxorq %XZERO, %XZERO, %XZERO + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. 
*/ + VPCMPEQ (%rdi), %VZERO, %k0 + + KMOV %k0, %VRCX + movq %rsi, %rax + + /* If src (rcx) is zero, bsf does not change the result. NB: + Must use 64-bit bsf here so that upper bits of len are not + cleared. */ + bsfq %rcx, %rax + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null + CHAR) and rsi must be > CHAR_PER_VEC. */ + cmpq $CHAR_PER_VEC, %rax + ja L(more_1x_vec) + /* Check if first match in bounds. */ + cmpq %rax, %rsi + cmovb %esi, %eax + ret + + +# if CHAR_PER_VEC != 32 + .p2align 4,, 2 +L(zero): +L(max_0): + movl %esi, %eax + ret +# endif + + /* Aligned more for strnlen compares remaining length vs 2 * + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before + going to the loop. */ + .p2align 4,, 10 +L(more_1x_vec): +L(cross_page_continue): + /* Compute number of words checked after aligning. */ +# ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + movq %rdi, %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax +# else + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax +# endif + + + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 + + cmpq $(CHAR_PER_VEC * 2), %rax + ja L(more_2x_vec) + +L(last_2x_vec_or_less): + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) + + /* Check the end of data. */ + SUB_SHORT (CHAR_PER_VEC, rax) + jbe L(max_0) + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jz L(max_0) + /* Best place for LAST_VEC_CHECK if ZMM. */ + .p2align 4,, 8 +L(last_vec_check): + bsf %VRDX, %VRDX + sub %eax, %edx + lea (%rsi, %rdx), %eax + cmovae %esi, %eax + ret + +# if CHAR_PER_VEC == 32 + .p2align 4,, 2 +L(zero): +L(max_0): + movl %esi, %eax + ret +# endif + + .p2align 4,, 8 +L(last_4x_vec_or_less): + addl $(CHAR_PER_VEC * -4), %eax + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 + subq $(VEC_SIZE * -4), %rdi + cmpl $(CHAR_PER_VEC * 2), %eax + jbe L(last_2x_vec_or_less) + + .p2align 4,, 6 +L(more_2x_vec): + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ -#define USE_AS_STRNLEN 1 -#define STRLEN STRNLEN + KMOV %k0, %VRDX -#include "strlen-evex.S" + test %VRDX, %VRDX + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x2) + + cmpq $(CHAR_PER_VEC * 4), %rax + ja L(more_4x_vec) + + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + addl $(CHAR_PER_VEC * -2), %eax + test %VRDX, %VRDX + jnz L(last_vec_check) + + subl $(CHAR_PER_VEC), %eax + jbe L(max_1) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(last_vec_check) +L(max_1): + movl %esi, %eax + ret + + .p2align 4,, 3 +L(first_vec_x2): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax + ret + .p2align 4,, 6 +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x1): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax + ret + + + .p2align 4,, 6 +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. 
*/ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + .p2align 4,, 6 +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret + + .p2align 4,, 5 +L(more_4x_vec): + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x4) + + /* Check if at last VEC_SIZE * 4 length before aligning for the + loop. */ + cmpq $(CHAR_PER_VEC * 8), %rax + jbe L(last_4x_vec_or_less) + + + /* Compute number of words checked after aligning. */ +# ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + leaq (VEC_SIZE * -3)(%rdi), %rdx +# else + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax +# endif + + subq $(VEC_SIZE * -1), %rdi + + /* Align data to VEC_SIZE * 4. */ +# if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# ifdef USE_AS_WCSLEN + subq %rdi, %rdx + sarq $2, %rdx + addq %rdx, %rax +# else + subq %rdi, %rax +# endif + /* Compare 4 * VEC at a time forward. */ + .p2align 4,, 11 +L(loop_4x_vec): + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k2 + subq $-(VEC_SIZE * 4), %rdi + /* Break if at end of length. */ + subq $(CHAR_PER_VEC * 4), %rax + jbe L(loop_len_end) + + + KORTEST %k0, %k2 + jz L(loop_4x_vec) + + +L(loop_last_4x_vec): + movq %rsi, %rcx + subq %rax, %rsi + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x0) + + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x1) + + VPTESTN %VMM(3), %VMM(3), %k0 + + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC + individually, for VEC_SIZE == 32 we combine them in a single + 64-bit GPR. */ +# if CHAR_PER_VEC == 64 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x2) + KMOV %k2, %VRDX +# else + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. + */ + kmovd %k2, %edx + kmovd %k0, %eax + salq $CHAR_PER_VEC, %rdx + orq %rax, %rdx +# endif + + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. + */ + bsfq %rdx, %rdx + leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret + + /* Handle last 4x VEC after loop. All VECs have been loaded. */ + .p2align 4,, 4 +L(loop_len_end): + KORTEST %k0, %k2 + jnz L(loop_last_4x_vec) + movq %rsi, %rax + ret + + +# if CHAR_PER_VEC == 64 + /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 + need return label for it. */ + .p2align 4,, 8 +L(last_vec_x2): + bsf %VRDX, %VRDX + leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret +# endif + + + .p2align 4,, 10 +L(last_vec_x1): + addq $CHAR_PER_VEC, %rsi +L(last_vec_x0): + bsf %VRDX, %VRDX + leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret + + + .p2align 4,, 8 +L(cross_page_boundary): + /* Align data to VEC_SIZE. 
*/ + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + VPCMPEQ (%rcx), %VZERO, %k0 + + KMOV %k0, %VRCX +# ifdef USE_AS_WCSLEN + shrl $2, %eax + andl $(CHAR_PER_VEC - 1), %eax +# endif + shrx %VRAX, %VRCX, %VRCX + + negl %eax + andl $(CHAR_PER_VEC - 1), %eax + movq %rsi, %rdx + bsf %VRCX, %VRDX + cmpq %rax, %rdx + ja L(cross_page_continue) + movl %edx, %eax + cmpq %rdx, %rsi + cmovb %esi, %eax + ret +END (STRNLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S index e2aad94c1e..57a7e93fbf 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S @@ -2,8 +2,7 @@ # define WCSNLEN __wcsnlen_evex #endif -#define STRLEN WCSNLEN +#define STRNLEN WCSNLEN #define USE_AS_WCSLEN 1 -#define USE_AS_STRNLEN 1 -#include "strlen-evex.S" +#include "strnlen-evex.S"