Message ID | 20240809220509.3631574-1-matthew.sterrett@intel.com
State | New
Series | x86: Unifies 'strnlen-evex' and 'strnlen-evex512' implementations.
On Fri, Aug 9, 2024 at 2:50 PM Matthew Sterrett <matthew.sterrett@intel.com> wrote: > > This commit uses a common implementation 'strnlen-evex-base.S' for both > 'strnlen-evex' and 'strnlen-evex512' > > This patch serves both to reduce the number of implementations, and it also does some small optimizations that benefit strnlen-evex and strnlen-evex512. > > All tests pass on x86. > > Benchmarks were taken on SKX. > https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html > > Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881 > Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953 > > Code Size Changes: > strnlen-evex : +31 bytes > strnlen-evex512 : +156 bytes > --- > sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++ > sysdeps/x86_64/multiarch/strnlen-evex.S | 428 +---------------- > sysdeps/x86_64/multiarch/strnlen-evex512.S | 259 +---------- > 3 files changed, 469 insertions(+), 680 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S > new file mode 100644 > index 0000000000..1c2cfdfe06 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S > @@ -0,0 +1,462 @@ > +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions. > + Copyright (C) 2022-2024 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +#ifdef USE_AS_WCSLEN > +# define VPCMPEQ vpcmpeqd > +# define VPTESTN vptestnmd > +# define VPMINU vpminud > +# define CHAR_SIZE 4 > +#else > +# define VPCMPEQ vpcmpeqb > +# define VPTESTN vptestnmb > +# define VPMINU vpminub > +# define CHAR_SIZE 1 > +#endif > + > +#define XZERO VMM_128(0) > +#define VZERO VMM(0) > +#define PAGE_SIZE 4096 > +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +#if CHAR_PER_VEC == 32 > +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) > +#else > +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) > +#endif > + > +#ifdef USE_AS_WCSLEN > +/* For wide-character, we care more about limitting code size > + than optimally aligning targets, so just cap nop padding > + reasonably low. */ > +# define P2ALIGN(...) .p2align 4,, 6 > +# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__) > +#else > +# define P2ALIGN(x) .p2align x > +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y > +#endif > + > + .section SECTION(.text), "ax", @progbits > + /* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN(STRNLEN, 6) > + /* rdi is pointer to array, rsi is the upper limit. 
*/ > + > + /* Check zero length. */ > + test %RSI_LP, %RSI_LP > + jz L(zero) > + > +#ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %esi, %esi > +#endif > + > + vpxorq %XZERO, %XZERO, %XZERO > + > + /* Check that we won't cross a page boundary with our first load. */ > + movl %edi, %eax > + shll $20, %eax > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > + ja L(crosses_page_boundary) > + > + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > + null byte. */ > + VPCMPEQ (%rdi), %VZERO, %k0 > + KMOV %k0, %VRCX > + > + /* If src (rcx) is zero, bsf does not change the result. NB: > + Must use 64-bit bsf here so that upper bits of len are not > + cleared. */ > + movq %rsi, %rax > + bsfq %rcx, %rax > + > + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null > + CHAR) and rsi must be > CHAR_PER_VEC. */ > + cmpq $CHAR_PER_VEC, %rax > + ja L(more_1x_vec) > + > + /* Check if first match in bounds. */ > + cmpq %rax, %rsi > + cmovb %esi, %eax > + ret > + > +#if VEC_SIZE == 32 > + P2ALIGN_CLAMPED(4, 2) > +L(zero): > +L(max_0): > + movl %esi, %eax > + ret > +#endif > + > + P2ALIGN_CLAMPED(4, 10) > +L(more_1x_vec): > +L(cross_page_continue): > + /* After this calculation, rax stores the number of elements > + left to be processed The complexity comes from the fact some > + elements get read twice due to alignment and we need to be > + sure we don't count them twice (else, it would just be rsi - > + CHAR_PER_VEC). */ > + > +#ifdef USE_AS_WCSLEN > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > + overflow. */ > + movq %rdi, %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > + sarq $2, %rax > + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax > +#else > + /* Calculate ptr + N - VEC_SIZE, then mask off the low bits, > + then subtract ptr to get the new aligned limit value. */ > + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > +#endif > + > + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 > + > + /* Checking here is faster for 256-bit but not 512-bit */ > +#if VEC_SIZE == 0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_check) > +#endif > + > + cmpq $(CHAR_PER_VEC * 2), %rax > + ja L(more_2x_vec) > + > +L(last_2x_vec_or_less): > + > + /* Checking here is faster for 512-bit but not 256-bit */ > +#if VEC_SIZE != 0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_check) > +#endif > + > + /* Check for the end of data. */ > + SUB_SHORT (CHAR_PER_VEC, rax) > + jbe L(max_0) > + > + /* Check the final remaining vector. */ > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > +#if VEC_SIZE == 32 > + jz L(max_0) > +#else > + jnz L(last_vec_check) > + P2ALIGN_CLAMPED(4, 2) > +L(zero): > +L(max_0): > + movl %esi, %eax > + ret > + > +#endif > + P2ALIGN_CLAMPED(4, 4) > +L(last_vec_check): > + bsf %VRDX, %VRDX > + sub %eax, %edx > + lea (%rsi, %rdx), %eax > + cmovae %esi, %eax > + ret > + > + > +#if VEC_SIZE == 32 > + P2ALIGN_CLAMPED(4, 8) > +#endif > +L(last_4x_vec_or_less): > + addl $(CHAR_PER_VEC * -4), %eax > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 > + > +#if VEC_SIZE == 64 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_check) > +#endif > + > + subq $(VEC_SIZE * -4), %rdi > + cmpl $(CHAR_PER_VEC * 2), %eax > + jbe L(last_2x_vec_or_less) > + > + P2ALIGN_CLAMPED(4, 6) > +L(more_2x_vec): > + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > + rechecking bounds. 
*/ > + > + /* Already checked in 256-bit case */ > +#if VEC_SIZE != 0 > + KMOV %k0, %VRDX > + > + test %VRDX, %VRDX > + jnz L(first_vec_x1) > +#endif > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + > + test %VRDX, %VRDX > + jnz L(first_vec_x2) > + > + cmpq $(CHAR_PER_VEC * 4), %rax > + ja L(more_4x_vec) > + > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + addl $(CHAR_PER_VEC * -2), %eax > + test %VRDX, %VRDX > + jnz L(last_vec_check) > + > + subb $(CHAR_PER_VEC), %al > + jbe L(max_1) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + > + test %VRDX, %VRDX > + jnz L(last_vec_check) > +L(max_1): > + movl %esi, %eax > + ret > + > + > + P2ALIGN_CLAMPED(4, 14) > +L(first_vec_x2): > +#if VEC_SIZE == 64 > + /* If VEC_SIZE == 64 we can fit logic for full return label in > + spare bytes before next cache line. */ > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax > + ret > + P2ALIGN_CLAMPED(4, 6) > +#else > + addl $CHAR_PER_VEC, %esi > +#endif > +L(first_vec_x1): > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax > + ret > + > +#if VEC_SIZE == 64 > + P2ALIGN_CLAMPED(4, 6) > +L(first_vec_x4): > +# if VEC_SIZE == 64 > + /* If VEC_SIZE == 64 we can fit logic for full return label in > + spare bytes before next cache line. */ > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > + ret > + P2ALIGN_CLAMPED(4, 6) > +# else > + addl $CHAR_PER_VEC, %esi > +# endif > +L(first_vec_x3): > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > + ret > +#endif > + > + P2ALIGN_CLAMPED(6, 20) > +L(more_4x_vec): > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(first_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(first_vec_x4) > + > + /* Check if at last VEC_SIZE * 4 length before aligning for the > + loop. */ > + cmpq $(CHAR_PER_VEC * 8), %rax > + jbe L(last_4x_vec_or_less) > + > + > + /* Compute number of words checked after aligning. */ > +#ifdef USE_AS_WCSLEN > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > + overflow. */ > + leaq (VEC_SIZE * -3)(%rdi), %rdx > +#else > + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax > +#endif > + > + subq $(VEC_SIZE * -1), %rdi > + > + /* Align data to VEC_SIZE * 4. */ > +#if VEC_SIZE == 64 > + /* Saves code size. No evex512 processor has partial register > + stalls. If that change this can be replaced with `andq > + $-(VEC_SIZE * 4), %rdi`. */ > + xorb %dil, %dil > +#else > + andq $-(VEC_SIZE * 4), %rdi > +#endif > + > +#ifdef USE_AS_WCSLEN > + subq %rdi, %rdx > + sarq $2, %rdx > + addq %rdx, %rax > +#else > + subq %rdi, %rax > +#endif > + > + // mov %rdi, %rdx > + > + P2ALIGN(6) > +L(loop): > + /* VPMINU and VPCMP combination provide better performance as > + compared to alternative combinations. 
*/ > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > + > + VPTESTN %VMM(2), %VMM(2), %k0 > + VPTESTN %VMM(4), %VMM(4), %k1 > + > + subq $-(VEC_SIZE * 4), %rdi > + KORTEST %k0, %k1 > + > + jnz L(loopend) > + subq $(CHAR_PER_VEC * 4), %rax > + ja L(loop) > + mov %rsi, %rax > + ret > + > + > +#if VEC_SIZE == 32 > + P2ALIGN_CLAMPED(4, 6) > +L(first_vec_x4): > +# if VEC_SIZE == 64 > + /* If VEC_SIZE == 64 we can fit logic for full return label in > + spare bytes before next cache line. */ > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > + ret > + P2ALIGN_CLAMPED(4, 6) > +# else > + addl $CHAR_PER_VEC, %esi > +# endif > +L(first_vec_x3): > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > + ret > +#endif > + > + > + P2ALIGN_CLAMPED(4, 11) > +L(loopend): > + /* We found a null terminator in one of the 4 vectors. */ > + > + /* Check the first vector. */ > + movq %rax, %r8 > + VPTESTN %VMM(1), %VMM(1), %k2 > + KMOV %k2, %VRCX > + bsf %rcx, %r8 > + > + cmpq $(CHAR_PER_VEC), %r8 > + jbe L(end_vec) > + > + /* Check the second vector. */ > + subq $(CHAR_PER_VEC), %rax > + movq %rax, %r8 > + KMOV %k0, %VRCX > + bsf %rcx, %r8 > + > + cmpq $(CHAR_PER_VEC), %r8 > + jbe L(end_vec) > + > + /* Check the third vector. */ > + subq $(CHAR_PER_VEC), %rax > + movq %rax, %r8 > + VPTESTN %VMM(3), %VMM(3), %k2 > + KMOV %k2, %VRCX > + bsf %rcx, %r8 > + > + cmpq $(CHAR_PER_VEC), %r8 > + jbe L(end_vec) > + > + /* It is in the fourth vector. */ > + subq $(CHAR_PER_VEC), %rax > + movq %rax, %r8 > + KMOV %k1, %VRCX > + bsf %rcx, %r8 > + > + P2ALIGN_CLAMPED(4, 3) > +L(end_vec): > + /* Get the number that has been processed. */ > + movq %rsi, %rcx > + subq %rax, %rcx > + > + /* Add that to the offset we found the null terminator at. */ > + leaq (%r8, %rcx), %rax > + > + /* Take the min of that and the limit. */ > + cmpq %rsi, %rax > + cmovnb %rsi, %rax > + ret > + > + P2ALIGN_CLAMPED(4, 11) > +L(crosses_page_boundary): > + /* Align data backwards to VEC_SIZE. */ > + shrl $20, %eax > + movq %rdi, %rcx > + andq $-VEC_SIZE, %rcx > + VPCMPEQ (%rcx), %VZERO, %k0 > + > + KMOV %k0, %VRCX > +#ifdef USE_AS_WCSLEN > + shrl $2, %eax > + andl $(CHAR_PER_VEC - 1), %eax > +#endif > + /* By this point rax contains number of bytes we need to skip. */ > + shrx %VRAX, %VRCX, %VRCX > + > + /* Calculates CHAR_PER_VEC - eax and stores in eax. */ > + negl %eax > + andl $(CHAR_PER_VEC - 1), %eax > + > + movq %rsi, %rdx > + bsf %VRCX, %VRDX > + cmpq %rax, %rdx > + ja L(cross_page_continue) > + > + /* The vector had a null terminator or we are at the limit. */ > + movl %edx, %eax > + cmpq %rdx, %rsi > + cmovb %esi, %eax > + ret > + > +END(STRNLEN) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S > index 91b16830eb..cba585452c 100644 > --- a/sysdeps/x86_64/multiarch/strnlen-evex.S > +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S > @@ -1,423 +1,7 @@ > -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. > - Copyright (C) 2022-2024 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. 
> - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <isa-level.h> > -#include <sysdep.h> > - > -#if ISA_SHOULD_BUILD (4) > - > -# ifndef VEC_SIZE > -# include "x86-evex256-vecs.h" > -# endif > - > - > -# ifndef STRNLEN > -# define STRNLEN __strnlen_evex > -# endif > - > -# ifdef USE_AS_WCSLEN > -# define VPCMPEQ vpcmpeqd > -# define VPCMPNEQ vpcmpneqd > -# define VPTESTN vptestnmd > -# define VPTEST vptestmd > -# define VPMINU vpminud > -# define CHAR_SIZE 4 > - > -# else > -# define VPCMPEQ vpcmpeqb > -# define VPCMPNEQ vpcmpneqb > -# define VPTESTN vptestnmb > -# define VPTEST vptestmb > -# define VPMINU vpminub > -# define CHAR_SIZE 1 > - > -# define REG_WIDTH VEC_SIZE > -# endif > - > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > - > -# include "reg-macros.h" > - > -# if CHAR_PER_VEC == 32 > -# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) > -# else > -# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) > -# endif > - > - > - > -# if CHAR_PER_VEC == 64 > -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > -# else > -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > -# endif > - > - > -# define XZERO VMM_128(0) > -# define VZERO VMM(0) > -# define PAGE_SIZE 4096 > - > - .section SECTION(.text), "ax", @progbits > -ENTRY_P2ALIGN (STRNLEN, 6) > - /* Check zero length. */ > - test %RSI_LP, %RSI_LP > - jz L(zero) > -# ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - movl %esi, %esi > -# endif > - > - movl %edi, %eax > - vpxorq %XZERO, %XZERO, %XZERO > - andl $(PAGE_SIZE - 1), %eax > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax > - ja L(cross_page_boundary) > - > - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > - null byte. */ > - VPCMPEQ (%rdi), %VZERO, %k0 > - > - KMOV %k0, %VRCX > - movq %rsi, %rax > - > - /* If src (rcx) is zero, bsf does not change the result. NB: > - Must use 64-bit bsf here so that upper bits of len are not > - cleared. */ > - bsfq %rcx, %rax > - /* If rax > CHAR_PER_VEC then rcx must have been zero (no null > - CHAR) and rsi must be > CHAR_PER_VEC. */ > - cmpq $CHAR_PER_VEC, %rax > - ja L(more_1x_vec) > - /* Check if first match in bounds. */ > - cmpq %rax, %rsi > - cmovb %esi, %eax > - ret > - > - > -# if CHAR_PER_VEC != 32 > - .p2align 4,, 2 > -L(zero): > -L(max_0): > - movl %esi, %eax > - ret > -# endif > - > - /* Aligned more for strnlen compares remaining length vs 2 * > - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before > - going to the loop. */ > - .p2align 4,, 10 > -L(more_1x_vec): > -L(cross_page_continue): > - /* Compute number of words checked after aligning. */ > -# ifdef USE_AS_WCSLEN > - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > - overflow. 
*/ > - movq %rdi, %rax > - andq $(VEC_SIZE * -1), %rdi > - subq %rdi, %rax > - sarq $2, %rax > - leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax > -# else > - leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax > - andq $(VEC_SIZE * -1), %rdi > - subq %rdi, %rax > -# endif > - > - > - VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 > - > - cmpq $(CHAR_PER_VEC * 2), %rax > - ja L(more_2x_vec) > - > -L(last_2x_vec_or_less): > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(last_vec_check) > - > - /* Check the end of data. */ > - SUB_SHORT (CHAR_PER_VEC, rax) > - jbe L(max_0) > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jz L(max_0) > - /* Best place for LAST_VEC_CHECK if ZMM. */ > - .p2align 4,, 8 > -L(last_vec_check): > - bsf %VRDX, %VRDX > - sub %eax, %edx > - lea (%rsi, %rdx), %eax > - cmovae %esi, %eax > - ret > - > -# if CHAR_PER_VEC == 32 > - .p2align 4,, 2 > -L(zero): > -L(max_0): > - movl %esi, %eax > - ret > -# endif > - > - .p2align 4,, 8 > -L(last_4x_vec_or_less): > - addl $(CHAR_PER_VEC * -4), %eax > - VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 > - subq $(VEC_SIZE * -4), %rdi > - cmpl $(CHAR_PER_VEC * 2), %eax > - jbe L(last_2x_vec_or_less) > - > - .p2align 4,, 6 > -L(more_2x_vec): > - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > - rechecking bounds. */ > - > - KMOV %k0, %VRDX > - > - test %VRDX, %VRDX > - jnz L(first_vec_x1) > - > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(first_vec_x2) > - > - cmpq $(CHAR_PER_VEC * 4), %rax > - ja L(more_4x_vec) > - > - > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - addl $(CHAR_PER_VEC * -2), %eax > - test %VRDX, %VRDX > - jnz L(last_vec_check) > - > - subl $(CHAR_PER_VEC), %eax > - jbe L(max_1) > - > - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - > - test %VRDX, %VRDX > - jnz L(last_vec_check) > -L(max_1): > - movl %esi, %eax > - ret > - > - .p2align 4,, 3 > -L(first_vec_x2): > -# if VEC_SIZE == 64 > - /* If VEC_SIZE == 64 we can fit logic for full return label in > - spare bytes before next cache line. */ > - bsf %VRDX, %VRDX > - sub %eax, %esi > - leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax > - ret > - .p2align 4,, 6 > -# else > - addl $CHAR_PER_VEC, %esi > -# endif > -L(first_vec_x1): > - bsf %VRDX, %VRDX > - sub %eax, %esi > - leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax > - ret > - > - > - .p2align 4,, 6 > -L(first_vec_x4): > -# if VEC_SIZE == 64 > - /* If VEC_SIZE == 64 we can fit logic for full return label in > - spare bytes before next cache line. */ > - bsf %VRDX, %VRDX > - sub %eax, %esi > - leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > - ret > - .p2align 4,, 6 > -# else > - addl $CHAR_PER_VEC, %esi > -# endif > -L(first_vec_x3): > - bsf %VRDX, %VRDX > - sub %eax, %esi > - leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > - ret > - > - .p2align 4,, 5 > -L(more_4x_vec): > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(first_vec_x3) > - > - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(first_vec_x4) > - > - /* Check if at last VEC_SIZE * 4 length before aligning for the > - loop. */ > - cmpq $(CHAR_PER_VEC * 8), %rax > - jbe L(last_4x_vec_or_less) > - > - > - /* Compute number of words checked after aligning. */ > -# ifdef USE_AS_WCSLEN > - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > - overflow. 
*/ > - leaq (VEC_SIZE * -3)(%rdi), %rdx > -# else > - leaq (VEC_SIZE * -3)(%rdi, %rax), %rax > -# endif > - > - subq $(VEC_SIZE * -1), %rdi > - > - /* Align data to VEC_SIZE * 4. */ > -# if VEC_SIZE == 64 > - /* Saves code size. No evex512 processor has partial register > - stalls. If that change this can be replaced with `andq > - $-(VEC_SIZE * 4), %rdi`. */ > - xorb %dil, %dil > -# else > - andq $-(VEC_SIZE * 4), %rdi > -# endif > - > -# ifdef USE_AS_WCSLEN > - subq %rdi, %rdx > - sarq $2, %rdx > - addq %rdx, %rax > -# else > - subq %rdi, %rax > -# endif > - /* Compare 4 * VEC at a time forward. */ > - .p2align 4,, 11 > -L(loop_4x_vec): > - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > - VPTESTN %VMM(2), %VMM(2), %k0 > - VPTESTN %VMM(4), %VMM(4), %k2 > - subq $-(VEC_SIZE * 4), %rdi > - /* Break if at end of length. */ > - subq $(CHAR_PER_VEC * 4), %rax > - jbe L(loop_len_end) > - > - > - KORTEST %k0, %k2 > - jz L(loop_4x_vec) > - > - > -L(loop_last_4x_vec): > - movq %rsi, %rcx > - subq %rax, %rsi > - VPTESTN %VMM(1), %VMM(1), %k1 > - KMOV %k1, %VRDX > - test %VRDX, %VRDX > - jnz L(last_vec_x0) > - > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(last_vec_x1) > - > - VPTESTN %VMM(3), %VMM(3), %k0 > - > - /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > - returning last 2x VEC. For VEC_SIZE == 64 we test each VEC > - individually, for VEC_SIZE == 32 we combine them in a single > - 64-bit GPR. */ > -# if CHAR_PER_VEC == 64 > - KMOV %k0, %VRDX > - test %VRDX, %VRDX > - jnz L(last_vec_x2) > - KMOV %k2, %VRDX > -# else > - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. > - */ > - kmovd %k2, %edx > - kmovd %k0, %eax > - salq $CHAR_PER_VEC, %rdx > - orq %rax, %rdx > -# endif > - > - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. > - */ > - bsfq %rdx, %rdx > - leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax > - cmpq %rax, %rcx > - cmovb %rcx, %rax > - ret > - > - /* Handle last 4x VEC after loop. All VECs have been loaded. */ > - .p2align 4,, 4 > -L(loop_len_end): > - KORTEST %k0, %k2 > - jnz L(loop_last_4x_vec) > - movq %rsi, %rax > - ret > - > - > -# if CHAR_PER_VEC == 64 > - /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 > - need return label for it. */ > - .p2align 4,, 8 > -L(last_vec_x2): > - bsf %VRDX, %VRDX > - leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax > - cmpq %rax, %rcx > - cmovb %rcx, %rax > - ret > -# endif > - > - > - .p2align 4,, 10 > -L(last_vec_x1): > - addq $CHAR_PER_VEC, %rsi > -L(last_vec_x0): > - bsf %VRDX, %VRDX > - leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax > - cmpq %rax, %rcx > - cmovb %rcx, %rax > - ret > - > - > - .p2align 4,, 8 > -L(cross_page_boundary): > - /* Align data to VEC_SIZE. 
*/ > - movq %rdi, %rcx > - andq $-VEC_SIZE, %rcx > - VPCMPEQ (%rcx), %VZERO, %k0 > - > - KMOV %k0, %VRCX > -# ifdef USE_AS_WCSLEN > - shrl $2, %eax > - andl $(CHAR_PER_VEC - 1), %eax > -# endif > - shrx %VRAX, %VRCX, %VRCX > - > - negl %eax > - andl $(CHAR_PER_VEC - 1), %eax > - movq %rsi, %rdx > - bsf %VRCX, %VRDX > - cmpq %rax, %rdx > - ja L(cross_page_continue) > - movl %edx, %eax > - cmpq %rdx, %rsi > - cmovb %esi, %eax > - ret > -END (STRNLEN) > +#ifndef STRNLEN > +#define STRNLEN __strnlen_evex > #endif > + > +#include "x86-evex256-vecs.h" > +#include "reg-macros.h" > +#include "strnlen-evex-base.S" > \ No newline at end of file > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S > index f8e55883bb..8ef54078f8 100644 > --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S > @@ -1,264 +1,7 @@ > -/* Placeholder function, not used by any processor at the moment. > - Copyright (C) 2022-2024 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > #ifndef STRNLEN > #define STRNLEN __strnlen_evex512 > #endif > > #include "x86-evex512-vecs.h" > #include "reg-macros.h" > - > -#include <isa-level.h> > - > -#if ISA_SHOULD_BUILD (4) > - > -# include <sysdep.h> > - > -# ifdef USE_AS_WCSLEN > -# define VPCMPEQ vpcmpeqd > -# define VPTESTN vptestnmd > -# define VPMINU vpminud > -# define CHAR_SIZE 4 > -# else > -# define VPCMPEQ vpcmpeqb > -# define VPTESTN vptestnmb > -# define VPMINU vpminub > -# define CHAR_SIZE 1 > -# endif > - > -# define PAGE_SIZE 4096 > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > - > - .section SECTION(.text),"ax",@progbits > -/* Aligning entry point to 64 byte, provides better performance for > - one vector length string. */ > -ENTRY_P2ALIGN (STRNLEN, 6) > - /* Check zero length. */ > - test %RSI_LP, %RSI_LP > - jz L(ret_max) > -# ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - movl %esi, %esi > -# endif > - > - movl %edi, %eax > - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) > - sall $20, %eax > - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > - ja L(page_cross) > - > - /* Compare [w]char for null, mask bit will be set for match. */ > - VPCMPEQ (%rdi), %VMM(0), %k0 > - KMOV %k0, %VRCX > - /* Store max length in rax. */ > - mov %rsi, %rax > - /* If rcx is 0, rax will have max length. We can not use VRCX > - and VRAX here for evex256 because, upper 32 bits may be > - undefined for ecx and eax. */ > - bsfq %rcx, %rax > - cmp $CHAR_PER_VEC, %rax > - ja L(align_more) > - cmpq %rax, %rsi > - cmovb %esi, %eax > - ret > - > - /* At this point vector max length reached. */ > - .p2align 4,,3 > -L(ret_max): > - movq %rsi, %rax > - ret > - > -L(align_more): > - mov %rdi, %rax > - /* Align rax to VEC_SIZE. 
*/ > - andq $-VEC_SIZE, %rax > - movq %rdi, %rdx > - subq %rax, %rdx > -# ifdef USE_AS_WCSLEN > - shr $2, %VRDX > -# endif > - /* At this point rdx contains [w]chars already compared. */ > - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx > - /* At this point rdx contains number of w[char] needs to go. > - Now onwards rdx will keep decrementing with each compare. */ > - > - /* Loop unroll 4 times for 4 vector loop. */ > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > - subq $-VEC_SIZE, %rax > - KMOV %k0, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x1) > - > - subq $CHAR_PER_VEC, %rdx > - jbe L(ret_max) > - > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > - KMOV %k0, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x2) > - > - subq $CHAR_PER_VEC, %rdx > - jbe L(ret_max) > - > - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 > - KMOV %k0, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x3) > - > - subq $CHAR_PER_VEC, %rdx > - jbe L(ret_max) > - > - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 > - KMOV %k0, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x4) > - > - subq $CHAR_PER_VEC, %rdx > - jbe L(ret_max) > - /* Save pointer before 4 x VEC_SIZE alignment. */ > - movq %rax, %rcx > - > - /* Align address to VEC_SIZE * 4 for loop. */ > - andq $-(VEC_SIZE * 4), %rax > - > - subq %rax, %rcx > -# ifdef USE_AS_WCSLEN > - shr $2, %VRCX > -# endif > - /* rcx contains number of [w]char will be recompared due to > - alignment fixes. rdx must be incremented by rcx to offset > - alignment adjustment. */ > - addq %rcx, %rdx > - /* Need jump as we don't want to add/subtract rdx for first > - iteration of 4 x VEC_SIZE aligned loop. */ > - > - .p2align 4,,11 > -L(loop): > - /* VPMINU and VPCMP combination provide better performance as > - compared to alternative combinations. */ > - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) > - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) > - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) > - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) > - > - VPTESTN %VMM(2), %VMM(2), %k0 > - VPTESTN %VMM(4), %VMM(4), %k1 > - > - subq $-(VEC_SIZE * 4), %rax > - KORTEST %k0, %k1 > - > - jnz L(loopend) > - subq $(CHAR_PER_VEC * 4), %rdx > - ja L(loop) > - mov %rsi, %rax > - ret > - > -L(loopend): > - > - VPTESTN %VMM(1), %VMM(1), %k2 > - KMOV %k2, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x1) > - > - KMOV %k0, %VRCX > - /* At this point, if k0 is non zero, null char must be in the > - second vector. */ > - test %VRCX, %VRCX > - jnz L(ret_vec_x2) > - > - VPTESTN %VMM(3), %VMM(3), %k3 > - KMOV %k3, %VRCX > - test %VRCX, %VRCX > - jnz L(ret_vec_x3) > - /* At this point null [w]char must be in the fourth vector so no > - need to check. */ > - KMOV %k1, %VRCX > - > - /* Fourth, third, second vector terminating are pretty much > - same, implemented this way to avoid branching and reuse code > - from pre loop exit condition. 
*/ > -L(ret_vec_x4): > - bsf %VRCX, %VRCX > - subq %rdi, %rax > -# ifdef USE_AS_WCSLEN > - subq $-(VEC_SIZE * 3), %rax > - shrq $2, %rax > - addq %rcx, %rax > -# else > - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax > -# endif > - > - cmpq %rsi, %rax > - cmovnb %rsi, %rax > - ret > - > -L(ret_vec_x3): > - bsf %VRCX, %VRCX > - subq %rdi, %rax > -# ifdef USE_AS_WCSLEN > - subq $-(VEC_SIZE * 2), %rax > - shrq $2, %rax > - addq %rcx, %rax > -# else > - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax > -# endif > - cmpq %rsi, %rax > - cmovnb %rsi, %rax > - ret > - > -L(ret_vec_x2): > - subq $-VEC_SIZE, %rax > -L(ret_vec_x1): > - bsf %VRCX, %VRCX > - subq %rdi, %rax > -# ifdef USE_AS_WCSLEN > - shrq $2, %rax > -# endif > - addq %rcx, %rax > - cmpq %rsi, %rax > - cmovnb %rsi, %rax > - ret > - > -L(page_cross): > - mov %rdi, %rax > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > -# ifdef USE_AS_WCSLEN > - sarl $2, %ecx > -# endif > - /* ecx contains number of w[char] to be skipped as a result > - of address alignment. */ > - andq $-VEC_SIZE, %rax > - VPCMPEQ (%rax), %VMM(0), %k0 > - KMOV %k0, %VRDX > - /* Ignore number of character for alignment adjustment. */ > - shr %cl, %VRDX > - jnz L(page_cross_end) > - movl $CHAR_PER_VEC, %eax > - sub %ecx, %eax > - cmp %rax, %rsi > - ja L(align_more) > - > -L(page_cross_end): > - bsf %VRDX, %VRAX > - cmpq %rsi, %rax > - cmovnb %esi, %eax > - ret > - > -END (STRNLEN) > -#endif > +#include "strnlen-evex-base.S" > \ No newline at end of file > -- > 2.37.2 > LGTM Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
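
As background for the review above: both variants only need to preserve the strnlen/wcsnlen semantics, i.e. return the index of the first null character, clamped to the caller's limit. A minimal scalar reference, purely for orientation (this is not the glibc implementation, and the function names here are made up), looks like this:

#include <stddef.h>
#include <wchar.h>

/* Index of the first null character in S, clamped to MAXLEN.  */
size_t
ref_strnlen (const char *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen; i++)
    if (s[i] == '\0')
      break;
  return i;
}

/* Wide-character variant (the USE_AS_WCSLEN build): same logic, but
   each element is a wchar_t, so the assembly uses CHAR_SIZE == 4 and
   counts the limit in characters rather than bytes.  */
size_t
ref_wcsnlen (const wchar_t *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen; i++)
    if (s[i] == L'\0')
      break;
  return i;
}

The clamp against the limit is what shows up throughout the assembly as the cmpq/cmov pairs involving %rsi, and the USE_AS_WCSLEN build is why CHAR_SIZE is 4 there (wchar_t is 4 bytes on glibc x86-64 targets).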
On Sun, Aug 18, 2024 at 4:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Aug 9, 2024 at 2:50 PM Matthew Sterrett > <matthew.sterrett@intel.com> wrote: > > > > This commit uses a common implementation 'strnlen-evex-base.S' for both > > 'strnlen-evex' and 'strnlen-evex512' > > > > This patch serves both to reduce the number of implementations, and it also does some small optimizations that benefit strnlen-evex and strnlen-evex512. > > > > All tests pass on x86. > > > > Benchmarks were taken on SKX. > > https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html > > > > Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881 > > Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953 > > > > Code Size Changes: > > strnlen-evex : +31 bytes > > strnlen-evex512 : +156 bytes > > --- > > sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/strnlen-evex.S | 428 +---------------- > > sysdeps/x86_64/multiarch/strnlen-evex512.S | 259 +---------- > > 3 files changed, 469 insertions(+), 680 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S > > > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S > > new file mode 100644 > > index 0000000000..1c2cfdfe06 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S > > @@ -0,0 +1,462 @@ > > +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions. > > + Copyright (C) 2022-2024 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +#ifdef USE_AS_WCSLEN > > +# define VPCMPEQ vpcmpeqd > > +# define VPTESTN vptestnmd > > +# define VPMINU vpminud > > +# define CHAR_SIZE 4 > > +#else > > +# define VPCMPEQ vpcmpeqb > > +# define VPTESTN vptestnmb > > +# define VPMINU vpminub > > +# define CHAR_SIZE 1 > > +#endif > > + > > +#define XZERO VMM_128(0) > > +#define VZERO VMM(0) > > +#define PAGE_SIZE 4096 > > +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > +#if CHAR_PER_VEC == 32 > > +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) > > +#else > > +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) > > +#endif > > + > > +#ifdef USE_AS_WCSLEN > > +/* For wide-character, we care more about limitting code size > > + than optimally aligning targets, so just cap nop padding > > + reasonably low. */ > > +# define P2ALIGN(...) .p2align 4,, 6 > > +# define P2ALIGN_CLAMPED(...) 
P2ALIGN(__VA_ARGS__) > > +#else > > +# define P2ALIGN(x) .p2align x > > +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y > > +#endif > > + > > + .section SECTION(.text), "ax", @progbits > > + /* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN(STRNLEN, 6) > > + /* rdi is pointer to array, rsi is the upper limit. */ > > + > > + /* Check zero length. */ > > + test %RSI_LP, %RSI_LP > > + jz L(zero) > > + > > +#ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %esi, %esi > > +#endif > > + > > + vpxorq %XZERO, %XZERO, %XZERO > > + > > + /* Check that we won't cross a page boundary with our first load. */ > > + movl %edi, %eax > > + shll $20, %eax > > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > > + ja L(crosses_page_boundary) > > + > > + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > > + null byte. */ > > + VPCMPEQ (%rdi), %VZERO, %k0 > > + KMOV %k0, %VRCX > > + > > + /* If src (rcx) is zero, bsf does not change the result. NB: > > + Must use 64-bit bsf here so that upper bits of len are not > > + cleared. */ > > + movq %rsi, %rax > > + bsfq %rcx, %rax > > + > > + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null > > + CHAR) and rsi must be > CHAR_PER_VEC. */ > > + cmpq $CHAR_PER_VEC, %rax > > + ja L(more_1x_vec) > > + > > + /* Check if first match in bounds. */ > > + cmpq %rax, %rsi > > + cmovb %esi, %eax > > + ret > > + > > +#if VEC_SIZE == 32 > > + P2ALIGN_CLAMPED(4, 2) > > +L(zero): > > +L(max_0): > > + movl %esi, %eax > > + ret > > +#endif > > + > > + P2ALIGN_CLAMPED(4, 10) > > +L(more_1x_vec): > > +L(cross_page_continue): > > + /* After this calculation, rax stores the number of elements > > + left to be processed The complexity comes from the fact some > > + elements get read twice due to alignment and we need to be > > + sure we don't count them twice (else, it would just be rsi - > > + CHAR_PER_VEC). */ > > + > > +#ifdef USE_AS_WCSLEN > > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > > + overflow. */ > > + movq %rdi, %rax > > + andq $(VEC_SIZE * -1), %rdi > > + subq %rdi, %rax > > + sarq $2, %rax > > + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax > > +#else > > + /* Calculate ptr + N - VEC_SIZE, then mask off the low bits, > > + then subtract ptr to get the new aligned limit value. */ > > + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax > > + andq $(VEC_SIZE * -1), %rdi > > + subq %rdi, %rax > > +#endif > > + > > + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 > > + > > + /* Checking here is faster for 256-bit but not 512-bit */ > > +#if VEC_SIZE == 0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(last_vec_check) > > +#endif > > + > > + cmpq $(CHAR_PER_VEC * 2), %rax > > + ja L(more_2x_vec) > > + > > +L(last_2x_vec_or_less): > > + > > + /* Checking here is faster for 512-bit but not 256-bit */ > > +#if VEC_SIZE != 0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(last_vec_check) > > +#endif > > + > > + /* Check for the end of data. */ > > + SUB_SHORT (CHAR_PER_VEC, rax) > > + jbe L(max_0) > > + > > + /* Check the final remaining vector. 
*/ > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > +#if VEC_SIZE == 32 > > + jz L(max_0) > > +#else > > + jnz L(last_vec_check) > > + P2ALIGN_CLAMPED(4, 2) > > +L(zero): > > +L(max_0): > > + movl %esi, %eax > > + ret > > + > > +#endif > > + P2ALIGN_CLAMPED(4, 4) > > +L(last_vec_check): > > + bsf %VRDX, %VRDX > > + sub %eax, %edx > > + lea (%rsi, %rdx), %eax > > + cmovae %esi, %eax > > + ret > > + > > + > > +#if VEC_SIZE == 32 > > + P2ALIGN_CLAMPED(4, 8) > > +#endif > > +L(last_4x_vec_or_less): > > + addl $(CHAR_PER_VEC * -4), %eax > > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 > > + > > +#if VEC_SIZE == 64 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(last_vec_check) > > +#endif > > + > > + subq $(VEC_SIZE * -4), %rdi > > + cmpl $(CHAR_PER_VEC * 2), %eax > > + jbe L(last_2x_vec_or_less) > > + > > + P2ALIGN_CLAMPED(4, 6) > > +L(more_2x_vec): > > + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > > + rechecking bounds. */ > > + > > + /* Already checked in 256-bit case */ > > +#if VEC_SIZE != 0 > > + KMOV %k0, %VRDX > > + > > + test %VRDX, %VRDX > > + jnz L(first_vec_x1) > > +#endif > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + > > + test %VRDX, %VRDX > > + jnz L(first_vec_x2) > > + > > + cmpq $(CHAR_PER_VEC * 4), %rax > > + ja L(more_4x_vec) > > + > > + > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + addl $(CHAR_PER_VEC * -2), %eax > > + test %VRDX, %VRDX > > + jnz L(last_vec_check) > > + > > + subb $(CHAR_PER_VEC), %al > > + jbe L(max_1) > > + > > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + > > + test %VRDX, %VRDX > > + jnz L(last_vec_check) > > +L(max_1): > > + movl %esi, %eax > > + ret > > + > > + > > + P2ALIGN_CLAMPED(4, 14) > > +L(first_vec_x2): > > +#if VEC_SIZE == 64 > > + /* If VEC_SIZE == 64 we can fit logic for full return label in > > + spare bytes before next cache line. */ > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax > > + ret > > + P2ALIGN_CLAMPED(4, 6) > > +#else > > + addl $CHAR_PER_VEC, %esi > > +#endif > > +L(first_vec_x1): > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax > > + ret > > + > > +#if VEC_SIZE == 64 > > + P2ALIGN_CLAMPED(4, 6) > > +L(first_vec_x4): > > +# if VEC_SIZE == 64 > > + /* If VEC_SIZE == 64 we can fit logic for full return label in > > + spare bytes before next cache line. */ > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > > + ret > > + P2ALIGN_CLAMPED(4, 6) > > +# else > > + addl $CHAR_PER_VEC, %esi > > +# endif > > +L(first_vec_x3): > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > > + ret > > +#endif > > + > > + P2ALIGN_CLAMPED(6, 20) > > +L(more_4x_vec): > > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(first_vec_x3) > > + > > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(first_vec_x4) > > + > > + /* Check if at last VEC_SIZE * 4 length before aligning for the > > + loop. */ > > + cmpq $(CHAR_PER_VEC * 8), %rax > > + jbe L(last_4x_vec_or_less) > > + > > + > > + /* Compute number of words checked after aligning. */ > > +#ifdef USE_AS_WCSLEN > > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > > + overflow. 
*/ > > + leaq (VEC_SIZE * -3)(%rdi), %rdx > > +#else > > + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax > > +#endif > > + > > + subq $(VEC_SIZE * -1), %rdi > > + > > + /* Align data to VEC_SIZE * 4. */ > > +#if VEC_SIZE == 64 > > + /* Saves code size. No evex512 processor has partial register > > + stalls. If that change this can be replaced with `andq > > + $-(VEC_SIZE * 4), %rdi`. */ > > + xorb %dil, %dil > > +#else > > + andq $-(VEC_SIZE * 4), %rdi > > +#endif > > + > > +#ifdef USE_AS_WCSLEN > > + subq %rdi, %rdx > > + sarq $2, %rdx > > + addq %rdx, %rax > > +#else > > + subq %rdi, %rax > > +#endif > > + > > + // mov %rdi, %rdx > > + > > + P2ALIGN(6) > > +L(loop): > > + /* VPMINU and VPCMP combination provide better performance as > > + compared to alternative combinations. */ > > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > > + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > > + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > > + > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + VPTESTN %VMM(4), %VMM(4), %k1 > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + KORTEST %k0, %k1 > > + > > + jnz L(loopend) > > + subq $(CHAR_PER_VEC * 4), %rax > > + ja L(loop) > > + mov %rsi, %rax > > + ret > > + > > + > > +#if VEC_SIZE == 32 > > + P2ALIGN_CLAMPED(4, 6) > > +L(first_vec_x4): > > +# if VEC_SIZE == 64 > > + /* If VEC_SIZE == 64 we can fit logic for full return label in > > + spare bytes before next cache line. */ > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > > + ret > > + P2ALIGN_CLAMPED(4, 6) > > +# else > > + addl $CHAR_PER_VEC, %esi > > +# endif > > +L(first_vec_x3): > > + bsf %VRDX, %VRDX > > + sub %eax, %esi > > + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > > + ret > > +#endif > > + > > + > > + P2ALIGN_CLAMPED(4, 11) > > +L(loopend): > > + /* We found a null terminator in one of the 4 vectors. */ > > + > > + /* Check the first vector. */ > > + movq %rax, %r8 > > + VPTESTN %VMM(1), %VMM(1), %k2 > > + KMOV %k2, %VRCX > > + bsf %rcx, %r8 > > + > > + cmpq $(CHAR_PER_VEC), %r8 > > + jbe L(end_vec) > > + > > + /* Check the second vector. */ > > + subq $(CHAR_PER_VEC), %rax > > + movq %rax, %r8 > > + KMOV %k0, %VRCX > > + bsf %rcx, %r8 > > + > > + cmpq $(CHAR_PER_VEC), %r8 > > + jbe L(end_vec) > > + > > + /* Check the third vector. */ > > + subq $(CHAR_PER_VEC), %rax > > + movq %rax, %r8 > > + VPTESTN %VMM(3), %VMM(3), %k2 > > + KMOV %k2, %VRCX > > + bsf %rcx, %r8 > > + > > + cmpq $(CHAR_PER_VEC), %r8 > > + jbe L(end_vec) > > + > > + /* It is in the fourth vector. */ > > + subq $(CHAR_PER_VEC), %rax > > + movq %rax, %r8 > > + KMOV %k1, %VRCX > > + bsf %rcx, %r8 > > + > > + P2ALIGN_CLAMPED(4, 3) > > +L(end_vec): > > + /* Get the number that has been processed. */ > > + movq %rsi, %rcx > > + subq %rax, %rcx > > + > > + /* Add that to the offset we found the null terminator at. */ > > + leaq (%r8, %rcx), %rax > > + > > + /* Take the min of that and the limit. */ > > + cmpq %rsi, %rax > > + cmovnb %rsi, %rax > > + ret > > + > > + P2ALIGN_CLAMPED(4, 11) > > +L(crosses_page_boundary): > > + /* Align data backwards to VEC_SIZE. */ > > + shrl $20, %eax > > + movq %rdi, %rcx > > + andq $-VEC_SIZE, %rcx > > + VPCMPEQ (%rcx), %VZERO, %k0 > > + > > + KMOV %k0, %VRCX > > +#ifdef USE_AS_WCSLEN > > + shrl $2, %eax > > + andl $(CHAR_PER_VEC - 1), %eax > > +#endif > > + /* By this point rax contains number of bytes we need to skip. */ > > + shrx %VRAX, %VRCX, %VRCX > > + > > + /* Calculates CHAR_PER_VEC - eax and stores in eax. 
*/ > > + negl %eax > > + andl $(CHAR_PER_VEC - 1), %eax > > + > > + movq %rsi, %rdx > > + bsf %VRCX, %VRDX > > + cmpq %rax, %rdx > > + ja L(cross_page_continue) > > + > > + /* The vector had a null terminator or we are at the limit. */ > > + movl %edx, %eax > > + cmpq %rdx, %rsi > > + cmovb %esi, %eax > > + ret > > + > > +END(STRNLEN) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S > > index 91b16830eb..cba585452c 100644 > > --- a/sysdeps/x86_64/multiarch/strnlen-evex.S > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S > > @@ -1,423 +1,7 @@ > > -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. > > - Copyright (C) 2022-2024 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. > > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <https://www.gnu.org/licenses/>. */ > > - > > -#include <isa-level.h> > > -#include <sysdep.h> > > - > > -#if ISA_SHOULD_BUILD (4) > > - > > -# ifndef VEC_SIZE > > -# include "x86-evex256-vecs.h" > > -# endif > > - > > - > > -# ifndef STRNLEN > > -# define STRNLEN __strnlen_evex > > -# endif > > - > > -# ifdef USE_AS_WCSLEN > > -# define VPCMPEQ vpcmpeqd > > -# define VPCMPNEQ vpcmpneqd > > -# define VPTESTN vptestnmd > > -# define VPTEST vptestmd > > -# define VPMINU vpminud > > -# define CHAR_SIZE 4 > > - > > -# else > > -# define VPCMPEQ vpcmpeqb > > -# define VPCMPNEQ vpcmpneqb > > -# define VPTESTN vptestnmb > > -# define VPTEST vptestmb > > -# define VPMINU vpminub > > -# define CHAR_SIZE 1 > > - > > -# define REG_WIDTH VEC_SIZE > > -# endif > > - > > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > - > > -# include "reg-macros.h" > > - > > -# if CHAR_PER_VEC == 32 > > -# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) > > -# else > > -# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) > > -# endif > > - > > - > > - > > -# if CHAR_PER_VEC == 64 > > -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > > -# else > > -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > > -# endif > > - > > - > > -# define XZERO VMM_128(0) > > -# define VZERO VMM(0) > > -# define PAGE_SIZE 4096 > > - > > - .section SECTION(.text), "ax", @progbits > > -ENTRY_P2ALIGN (STRNLEN, 6) > > - /* Check zero length. */ > > - test %RSI_LP, %RSI_LP > > - jz L(zero) > > -# ifdef __ILP32__ > > - /* Clear the upper 32 bits. */ > > - movl %esi, %esi > > -# endif > > - > > - movl %edi, %eax > > - vpxorq %XZERO, %XZERO, %XZERO > > - andl $(PAGE_SIZE - 1), %eax > > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > - ja L(cross_page_boundary) > > - > > - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > > - null byte. */ > > - VPCMPEQ (%rdi), %VZERO, %k0 > > - > > - KMOV %k0, %VRCX > > - movq %rsi, %rax > > - > > - /* If src (rcx) is zero, bsf does not change the result. 
NB: > > - Must use 64-bit bsf here so that upper bits of len are not > > - cleared. */ > > - bsfq %rcx, %rax > > - /* If rax > CHAR_PER_VEC then rcx must have been zero (no null > > - CHAR) and rsi must be > CHAR_PER_VEC. */ > > - cmpq $CHAR_PER_VEC, %rax > > - ja L(more_1x_vec) > > - /* Check if first match in bounds. */ > > - cmpq %rax, %rsi > > - cmovb %esi, %eax > > - ret > > - > > - > > -# if CHAR_PER_VEC != 32 > > - .p2align 4,, 2 > > -L(zero): > > -L(max_0): > > - movl %esi, %eax > > - ret > > -# endif > > - > > - /* Aligned more for strnlen compares remaining length vs 2 * > > - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before > > - going to the loop. */ > > - .p2align 4,, 10 > > -L(more_1x_vec): > > -L(cross_page_continue): > > - /* Compute number of words checked after aligning. */ > > -# ifdef USE_AS_WCSLEN > > - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > > - overflow. */ > > - movq %rdi, %rax > > - andq $(VEC_SIZE * -1), %rdi > > - subq %rdi, %rax > > - sarq $2, %rax > > - leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax > > -# else > > - leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax > > - andq $(VEC_SIZE * -1), %rdi > > - subq %rdi, %rax > > -# endif > > - > > - > > - VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 > > - > > - cmpq $(CHAR_PER_VEC * 2), %rax > > - ja L(more_2x_vec) > > - > > -L(last_2x_vec_or_less): > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(last_vec_check) > > - > > - /* Check the end of data. */ > > - SUB_SHORT (CHAR_PER_VEC, rax) > > - jbe L(max_0) > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jz L(max_0) > > - /* Best place for LAST_VEC_CHECK if ZMM. */ > > - .p2align 4,, 8 > > -L(last_vec_check): > > - bsf %VRDX, %VRDX > > - sub %eax, %edx > > - lea (%rsi, %rdx), %eax > > - cmovae %esi, %eax > > - ret > > - > > -# if CHAR_PER_VEC == 32 > > - .p2align 4,, 2 > > -L(zero): > > -L(max_0): > > - movl %esi, %eax > > - ret > > -# endif > > - > > - .p2align 4,, 8 > > -L(last_4x_vec_or_less): > > - addl $(CHAR_PER_VEC * -4), %eax > > - VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 > > - subq $(VEC_SIZE * -4), %rdi > > - cmpl $(CHAR_PER_VEC * 2), %eax > > - jbe L(last_2x_vec_or_less) > > - > > - .p2align 4,, 6 > > -L(more_2x_vec): > > - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > > - rechecking bounds. */ > > - > > - KMOV %k0, %VRDX > > - > > - test %VRDX, %VRDX > > - jnz L(first_vec_x1) > > - > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(first_vec_x2) > > - > > - cmpq $(CHAR_PER_VEC * 4), %rax > > - ja L(more_4x_vec) > > - > > - > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - addl $(CHAR_PER_VEC * -2), %eax > > - test %VRDX, %VRDX > > - jnz L(last_vec_check) > > - > > - subl $(CHAR_PER_VEC), %eax > > - jbe L(max_1) > > - > > - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - > > - test %VRDX, %VRDX > > - jnz L(last_vec_check) > > -L(max_1): > > - movl %esi, %eax > > - ret > > - > > - .p2align 4,, 3 > > -L(first_vec_x2): > > -# if VEC_SIZE == 64 > > - /* If VEC_SIZE == 64 we can fit logic for full return label in > > - spare bytes before next cache line. 
*/ > > - bsf %VRDX, %VRDX > > - sub %eax, %esi > > - leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax > > - ret > > - .p2align 4,, 6 > > -# else > > - addl $CHAR_PER_VEC, %esi > > -# endif > > -L(first_vec_x1): > > - bsf %VRDX, %VRDX > > - sub %eax, %esi > > - leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax > > - ret > > - > > - > > - .p2align 4,, 6 > > -L(first_vec_x4): > > -# if VEC_SIZE == 64 > > - /* If VEC_SIZE == 64 we can fit logic for full return label in > > - spare bytes before next cache line. */ > > - bsf %VRDX, %VRDX > > - sub %eax, %esi > > - leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > > - ret > > - .p2align 4,, 6 > > -# else > > - addl $CHAR_PER_VEC, %esi > > -# endif > > -L(first_vec_x3): > > - bsf %VRDX, %VRDX > > - sub %eax, %esi > > - leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > > - ret > > - > > - .p2align 4,, 5 > > -L(more_4x_vec): > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(first_vec_x3) > > - > > - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(first_vec_x4) > > - > > - /* Check if at last VEC_SIZE * 4 length before aligning for the > > - loop. */ > > - cmpq $(CHAR_PER_VEC * 8), %rax > > - jbe L(last_4x_vec_or_less) > > - > > - > > - /* Compute number of words checked after aligning. */ > > -# ifdef USE_AS_WCSLEN > > - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > > - overflow. */ > > - leaq (VEC_SIZE * -3)(%rdi), %rdx > > -# else > > - leaq (VEC_SIZE * -3)(%rdi, %rax), %rax > > -# endif > > - > > - subq $(VEC_SIZE * -1), %rdi > > - > > - /* Align data to VEC_SIZE * 4. */ > > -# if VEC_SIZE == 64 > > - /* Saves code size. No evex512 processor has partial register > > - stalls. If that change this can be replaced with `andq > > - $-(VEC_SIZE * 4), %rdi`. */ > > - xorb %dil, %dil > > -# else > > - andq $-(VEC_SIZE * 4), %rdi > > -# endif > > - > > -# ifdef USE_AS_WCSLEN > > - subq %rdi, %rdx > > - sarq $2, %rdx > > - addq %rdx, %rax > > -# else > > - subq %rdi, %rax > > -# endif > > - /* Compare 4 * VEC at a time forward. */ > > - .p2align 4,, 11 > > -L(loop_4x_vec): > > - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > > - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > > - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > > - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > > - VPTESTN %VMM(2), %VMM(2), %k0 > > - VPTESTN %VMM(4), %VMM(4), %k2 > > - subq $-(VEC_SIZE * 4), %rdi > > - /* Break if at end of length. */ > > - subq $(CHAR_PER_VEC * 4), %rax > > - jbe L(loop_len_end) > > - > > - > > - KORTEST %k0, %k2 > > - jz L(loop_4x_vec) > > - > > - > > -L(loop_last_4x_vec): > > - movq %rsi, %rcx > > - subq %rax, %rsi > > - VPTESTN %VMM(1), %VMM(1), %k1 > > - KMOV %k1, %VRDX > > - test %VRDX, %VRDX > > - jnz L(last_vec_x0) > > - > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(last_vec_x1) > > - > > - VPTESTN %VMM(3), %VMM(3), %k0 > > - > > - /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > > - returning last 2x VEC. For VEC_SIZE == 64 we test each VEC > > - individually, for VEC_SIZE == 32 we combine them in a single > > - 64-bit GPR. */ > > -# if CHAR_PER_VEC == 64 > > - KMOV %k0, %VRDX > > - test %VRDX, %VRDX > > - jnz L(last_vec_x2) > > - KMOV %k2, %VRDX > > -# else > > - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. > > - */ > > - kmovd %k2, %edx > > - kmovd %k0, %eax > > - salq $CHAR_PER_VEC, %rdx > > - orq %rax, %rdx > > -# endif > > - > > - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. 
> > - */ > > - bsfq %rdx, %rdx > > - leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax > > - cmpq %rax, %rcx > > - cmovb %rcx, %rax > > - ret > > - > > - /* Handle last 4x VEC after loop. All VECs have been loaded. */ > > - .p2align 4,, 4 > > -L(loop_len_end): > > - KORTEST %k0, %k2 > > - jnz L(loop_last_4x_vec) > > - movq %rsi, %rax > > - ret > > - > > - > > -# if CHAR_PER_VEC == 64 > > - /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 > > - need return label for it. */ > > - .p2align 4,, 8 > > -L(last_vec_x2): > > - bsf %VRDX, %VRDX > > - leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax > > - cmpq %rax, %rcx > > - cmovb %rcx, %rax > > - ret > > -# endif > > - > > - > > - .p2align 4,, 10 > > -L(last_vec_x1): > > - addq $CHAR_PER_VEC, %rsi > > -L(last_vec_x0): > > - bsf %VRDX, %VRDX > > - leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax > > - cmpq %rax, %rcx > > - cmovb %rcx, %rax > > - ret > > - > > - > > - .p2align 4,, 8 > > -L(cross_page_boundary): > > - /* Align data to VEC_SIZE. */ > > - movq %rdi, %rcx > > - andq $-VEC_SIZE, %rcx > > - VPCMPEQ (%rcx), %VZERO, %k0 > > - > > - KMOV %k0, %VRCX > > -# ifdef USE_AS_WCSLEN > > - shrl $2, %eax > > - andl $(CHAR_PER_VEC - 1), %eax > > -# endif > > - shrx %VRAX, %VRCX, %VRCX > > - > > - negl %eax > > - andl $(CHAR_PER_VEC - 1), %eax > > - movq %rsi, %rdx > > - bsf %VRCX, %VRDX > > - cmpq %rax, %rdx > > - ja L(cross_page_continue) > > - movl %edx, %eax > > - cmpq %rdx, %rsi > > - cmovb %esi, %eax > > - ret > > -END (STRNLEN) > > +#ifndef STRNLEN > > +#define STRNLEN __strnlen_evex > > #endif > > + > > +#include "x86-evex256-vecs.h" > > +#include "reg-macros.h" > > +#include "strnlen-evex-base.S" > > \ No newline at end of file > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S > > index f8e55883bb..8ef54078f8 100644 > > --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S > > @@ -1,264 +1,7 @@ > > -/* Placeholder function, not used by any processor at the moment. > > - Copyright (C) 2022-2024 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. > > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <https://www.gnu.org/licenses/>. 
*/ > > - > > #ifndef STRNLEN > > #define STRNLEN __strnlen_evex512 > > #endif > > > > #include "x86-evex512-vecs.h" > > #include "reg-macros.h" > > - > > -#include <isa-level.h> > > - > > -#if ISA_SHOULD_BUILD (4) > > - > > -# include <sysdep.h> > > - > > -# ifdef USE_AS_WCSLEN > > -# define VPCMPEQ vpcmpeqd > > -# define VPTESTN vptestnmd > > -# define VPMINU vpminud > > -# define CHAR_SIZE 4 > > -# else > > -# define VPCMPEQ vpcmpeqb > > -# define VPTESTN vptestnmb > > -# define VPMINU vpminub > > -# define CHAR_SIZE 1 > > -# endif > > - > > -# define PAGE_SIZE 4096 > > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > - > > - .section SECTION(.text),"ax",@progbits > > -/* Aligning entry point to 64 byte, provides better performance for > > - one vector length string. */ > > -ENTRY_P2ALIGN (STRNLEN, 6) > > - /* Check zero length. */ > > - test %RSI_LP, %RSI_LP > > - jz L(ret_max) > > -# ifdef __ILP32__ > > - /* Clear the upper 32 bits. */ > > - movl %esi, %esi > > -# endif > > - > > - movl %edi, %eax > > - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) > > - sall $20, %eax > > - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax > > - ja L(page_cross) > > - > > - /* Compare [w]char for null, mask bit will be set for match. */ > > - VPCMPEQ (%rdi), %VMM(0), %k0 > > - KMOV %k0, %VRCX > > - /* Store max length in rax. */ > > - mov %rsi, %rax > > - /* If rcx is 0, rax will have max length. We can not use VRCX > > - and VRAX here for evex256 because, upper 32 bits may be > > - undefined for ecx and eax. */ > > - bsfq %rcx, %rax > > - cmp $CHAR_PER_VEC, %rax > > - ja L(align_more) > > - cmpq %rax, %rsi > > - cmovb %esi, %eax > > - ret > > - > > - /* At this point vector max length reached. */ > > - .p2align 4,,3 > > -L(ret_max): > > - movq %rsi, %rax > > - ret > > - > > -L(align_more): > > - mov %rdi, %rax > > - /* Align rax to VEC_SIZE. */ > > - andq $-VEC_SIZE, %rax > > - movq %rdi, %rdx > > - subq %rax, %rdx > > -# ifdef USE_AS_WCSLEN > > - shr $2, %VRDX > > -# endif > > - /* At this point rdx contains [w]chars already compared. */ > > - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx > > - /* At this point rdx contains number of w[char] needs to go. > > - Now onwards rdx will keep decrementing with each compare. */ > > - > > - /* Loop unroll 4 times for 4 vector loop. */ > > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > > - subq $-VEC_SIZE, %rax > > - KMOV %k0, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x1) > > - > > - subq $CHAR_PER_VEC, %rdx > > - jbe L(ret_max) > > - > > - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 > > - KMOV %k0, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x2) > > - > > - subq $CHAR_PER_VEC, %rdx > > - jbe L(ret_max) > > - > > - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 > > - KMOV %k0, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x3) > > - > > - subq $CHAR_PER_VEC, %rdx > > - jbe L(ret_max) > > - > > - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 > > - KMOV %k0, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x4) > > - > > - subq $CHAR_PER_VEC, %rdx > > - jbe L(ret_max) > > - /* Save pointer before 4 x VEC_SIZE alignment. */ > > - movq %rax, %rcx > > - > > - /* Align address to VEC_SIZE * 4 for loop. */ > > - andq $-(VEC_SIZE * 4), %rax > > - > > - subq %rax, %rcx > > -# ifdef USE_AS_WCSLEN > > - shr $2, %VRCX > > -# endif > > - /* rcx contains number of [w]char will be recompared due to > > - alignment fixes. rdx must be incremented by rcx to offset > > - alignment adjustment. 
*/ > > - addq %rcx, %rdx > > - /* Need jump as we don't want to add/subtract rdx for first > > - iteration of 4 x VEC_SIZE aligned loop. */ > > - > > - .p2align 4,,11 > > -L(loop): > > - /* VPMINU and VPCMP combination provide better performance as > > - compared to alternative combinations. */ > > - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) > > - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) > > - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) > > - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) > > - > > - VPTESTN %VMM(2), %VMM(2), %k0 > > - VPTESTN %VMM(4), %VMM(4), %k1 > > - > > - subq $-(VEC_SIZE * 4), %rax > > - KORTEST %k0, %k1 > > - > > - jnz L(loopend) > > - subq $(CHAR_PER_VEC * 4), %rdx > > - ja L(loop) > > - mov %rsi, %rax > > - ret > > - > > -L(loopend): > > - > > - VPTESTN %VMM(1), %VMM(1), %k2 > > - KMOV %k2, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x1) > > - > > - KMOV %k0, %VRCX > > - /* At this point, if k0 is non zero, null char must be in the > > - second vector. */ > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x2) > > - > > - VPTESTN %VMM(3), %VMM(3), %k3 > > - KMOV %k3, %VRCX > > - test %VRCX, %VRCX > > - jnz L(ret_vec_x3) > > - /* At this point null [w]char must be in the fourth vector so no > > - need to check. */ > > - KMOV %k1, %VRCX > > - > > - /* Fourth, third, second vector terminating are pretty much > > - same, implemented this way to avoid branching and reuse code > > - from pre loop exit condition. */ > > -L(ret_vec_x4): > > - bsf %VRCX, %VRCX > > - subq %rdi, %rax > > -# ifdef USE_AS_WCSLEN > > - subq $-(VEC_SIZE * 3), %rax > > - shrq $2, %rax > > - addq %rcx, %rax > > -# else > > - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax > > -# endif > > - > > - cmpq %rsi, %rax > > - cmovnb %rsi, %rax > > - ret > > - > > -L(ret_vec_x3): > > - bsf %VRCX, %VRCX > > - subq %rdi, %rax > > -# ifdef USE_AS_WCSLEN > > - subq $-(VEC_SIZE * 2), %rax > > - shrq $2, %rax > > - addq %rcx, %rax > > -# else > > - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax > > -# endif > > - cmpq %rsi, %rax > > - cmovnb %rsi, %rax > > - ret > > - > > -L(ret_vec_x2): > > - subq $-VEC_SIZE, %rax > > -L(ret_vec_x1): > > - bsf %VRCX, %VRCX > > - subq %rdi, %rax > > -# ifdef USE_AS_WCSLEN > > - shrq $2, %rax > > -# endif > > - addq %rcx, %rax > > - cmpq %rsi, %rax > > - cmovnb %rsi, %rax > > - ret > > - > > -L(page_cross): > > - mov %rdi, %rax > > - movl %edi, %ecx > > - andl $(VEC_SIZE - 1), %ecx > > -# ifdef USE_AS_WCSLEN > > - sarl $2, %ecx > > -# endif > > - /* ecx contains number of w[char] to be skipped as a result > > - of address alignment. */ > > - andq $-VEC_SIZE, %rax > > - VPCMPEQ (%rax), %VMM(0), %k0 > > - KMOV %k0, %VRDX > > - /* Ignore number of character for alignment adjustment. */ > > - shr %cl, %VRDX > > - jnz L(page_cross_end) > > - movl $CHAR_PER_VEC, %eax > > - sub %ecx, %eax > > - cmp %rax, %rsi > > - ja L(align_more) > > - > > -L(page_cross_end): > > - bsf %VRDX, %VRAX > > - cmpq %rsi, %rax > > - cmovnb %esi, %eax > > - ret > > - > > -END (STRNLEN) > > -#endif > > +#include "strnlen-evex-base.S" > > \ No newline at end of file > > -- > > 2.37.2 > > > > LGTM > > Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com> I pushed onto master branch.
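A note for readers of the unified file below: the behaviour both the evex256 and evex512 builds have to preserve is the textbook strnlen/wcsnlen contract, i.e. count at most maxlen characters and stop at the first terminator. A minimal C reference, purely illustrative; ref_strnlen and ref_wcsnlen are made-up names and not part of the patch or of glibc's test harness:

#include <stddef.h>
#include <wchar.h>

/* Count at most MAXLEN characters, stopping early at the first
   terminator.  Never reads past the terminator or past MAXLEN.  */
static size_t
ref_strnlen (const char *s, size_t maxlen)
{
  size_t i = 0;
  while (i < maxlen && s[i] != '\0')
    i++;
  return i;
}

/* Same contract for the wide-character variant (USE_AS_WCSLEN).  */
static size_t
ref_wcsnlen (const wchar_t *s, size_t maxlen)
{
  size_t i = 0;
  while (i < maxlen && s[i] != L'\0')
    i++;
  return i;
}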
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S new file mode 100644 index 0000000000..1c2cfdfe06 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S @@ -0,0 +1,462 @@ +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions. + Copyright (C) 2022-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +#ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPTESTN vptestnmd +# define VPMINU vpminud +# define CHAR_SIZE 4 +#else +# define VPCMPEQ vpcmpeqb +# define VPTESTN vptestnmb +# define VPMINU vpminub +# define CHAR_SIZE 1 +#endif + +#define XZERO VMM_128(0) +#define VZERO VMM(0) +#define PAGE_SIZE 4096 +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +#if CHAR_PER_VEC == 32 +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) +#else +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) +#endif + +#ifdef USE_AS_WCSLEN +/* For wide-character, we care more about limitting code size + than optimally aligning targets, so just cap nop padding + reasonably low. */ +# define P2ALIGN(...) .p2align 4,, 6 +# define P2ALIGN_CLAMPED(...) P2ALIGN(__VA_ARGS__) +#else +# define P2ALIGN(x) .p2align x +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y +#endif + + .section SECTION(.text), "ax", @progbits + /* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN(STRNLEN, 6) + /* rdi is pointer to array, rsi is the upper limit. */ + + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) + +#ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +#endif + + vpxorq %XZERO, %XZERO, %XZERO + + /* Check that we won't cross a page boundary with our first load. */ + movl %edi, %eax + shll $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(crosses_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMPEQ (%rdi), %VZERO, %k0 + KMOV %k0, %VRCX + + /* If src (rcx) is zero, bsf does not change the result. NB: + Must use 64-bit bsf here so that upper bits of len are not + cleared. */ + movq %rsi, %rax + bsfq %rcx, %rax + + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null + CHAR) and rsi must be > CHAR_PER_VEC. */ + cmpq $CHAR_PER_VEC, %rax + ja L(more_1x_vec) + + /* Check if first match in bounds. 
*/ + cmpq %rax, %rsi + cmovb %esi, %eax + ret + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 2) +L(zero): +L(max_0): + movl %esi, %eax + ret +#endif + + P2ALIGN_CLAMPED(4, 10) +L(more_1x_vec): +L(cross_page_continue): + /* After this calculation, rax stores the number of elements + left to be processed The complexity comes from the fact some + elements get read twice due to alignment and we need to be + sure we don't count them twice (else, it would just be rsi - + CHAR_PER_VEC). */ + +#ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + movq %rdi, %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax +#else + /* Calculate ptr + N - VEC_SIZE, then mask off the low bits, + then subtract ptr to get the new aligned limit value. */ + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax +#endif + + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 + + /* Checking here is faster for 256-bit but not 512-bit */ +#if VEC_SIZE == 0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + cmpq $(CHAR_PER_VEC * 2), %rax + ja L(more_2x_vec) + +L(last_2x_vec_or_less): + + /* Checking here is faster for 512-bit but not 256-bit */ +#if VEC_SIZE != 0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + /* Check for the end of data. */ + SUB_SHORT (CHAR_PER_VEC, rax) + jbe L(max_0) + + /* Check the final remaining vector. */ + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX +#if VEC_SIZE == 32 + jz L(max_0) +#else + jnz L(last_vec_check) + P2ALIGN_CLAMPED(4, 2) +L(zero): +L(max_0): + movl %esi, %eax + ret + +#endif + P2ALIGN_CLAMPED(4, 4) +L(last_vec_check): + bsf %VRDX, %VRDX + sub %eax, %edx + lea (%rsi, %rdx), %eax + cmovae %esi, %eax + ret + + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 8) +#endif +L(last_4x_vec_or_less): + addl $(CHAR_PER_VEC * -4), %eax + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 + +#if VEC_SIZE == 64 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) +#endif + + subq $(VEC_SIZE * -4), %rdi + cmpl $(CHAR_PER_VEC * 2), %eax + jbe L(last_2x_vec_or_less) + + P2ALIGN_CLAMPED(4, 6) +L(more_2x_vec): + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ + + /* Already checked in 256-bit case */ +#if VEC_SIZE != 0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(first_vec_x1) +#endif + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(first_vec_x2) + + cmpq $(CHAR_PER_VEC * 4), %rax + ja L(more_4x_vec) + + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + addl $(CHAR_PER_VEC * -2), %eax + test %VRDX, %VRDX + jnz L(last_vec_check) + + subb $(CHAR_PER_VEC), %al + jbe L(max_1) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(last_vec_check) +L(max_1): + movl %esi, %eax + ret + + + P2ALIGN_CLAMPED(4, 14) +L(first_vec_x2): +#if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. 
*/ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +#else + addl $CHAR_PER_VEC, %esi +#endif +L(first_vec_x1): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax + ret + +#if VEC_SIZE == 64 + P2ALIGN_CLAMPED(4, 6) +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret +#endif + + P2ALIGN_CLAMPED(6, 20) +L(more_4x_vec): + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x4) + + /* Check if at last VEC_SIZE * 4 length before aligning for the + loop. */ + cmpq $(CHAR_PER_VEC * 8), %rax + jbe L(last_4x_vec_or_less) + + + /* Compute number of words checked after aligning. */ +#ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + leaq (VEC_SIZE * -3)(%rdi), %rdx +#else + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax +#endif + + subq $(VEC_SIZE * -1), %rdi + + /* Align data to VEC_SIZE * 4. */ +#if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. */ + xorb %dil, %dil +#else + andq $-(VEC_SIZE * 4), %rdi +#endif + +#ifdef USE_AS_WCSLEN + subq %rdi, %rdx + sarq $2, %rdx + addq %rdx, %rax +#else + subq %rdi, %rax +#endif + + // mov %rdi, %rdx + + P2ALIGN(6) +L(loop): + /* VPMINU and VPCMP combination provide better performance as + compared to alternative combinations. */ + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k1 + + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k0, %k1 + + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rax + ja L(loop) + mov %rsi, %rax + ret + + +#if VEC_SIZE == 32 + P2ALIGN_CLAMPED(4, 6) +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + P2ALIGN_CLAMPED(4, 6) +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret +#endif + + + P2ALIGN_CLAMPED(4, 11) +L(loopend): + /* We found a null terminator in one of the 4 vectors. */ + + /* Check the first vector. */ + movq %rax, %r8 + VPTESTN %VMM(1), %VMM(1), %k2 + KMOV %k2, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* Check the second vector. */ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + KMOV %k0, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* Check the third vector. */ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + VPTESTN %VMM(3), %VMM(3), %k2 + KMOV %k2, %VRCX + bsf %rcx, %r8 + + cmpq $(CHAR_PER_VEC), %r8 + jbe L(end_vec) + + /* It is in the fourth vector. 
*/ + subq $(CHAR_PER_VEC), %rax + movq %rax, %r8 + KMOV %k1, %VRCX + bsf %rcx, %r8 + + P2ALIGN_CLAMPED(4, 3) +L(end_vec): + /* Get the number that has been processed. */ + movq %rsi, %rcx + subq %rax, %rcx + + /* Add that to the offset we found the null terminator at. */ + leaq (%r8, %rcx), %rax + + /* Take the min of that and the limit. */ + cmpq %rsi, %rax + cmovnb %rsi, %rax + ret + + P2ALIGN_CLAMPED(4, 11) +L(crosses_page_boundary): + /* Align data backwards to VEC_SIZE. */ + shrl $20, %eax + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + VPCMPEQ (%rcx), %VZERO, %k0 + + KMOV %k0, %VRCX +#ifdef USE_AS_WCSLEN + shrl $2, %eax + andl $(CHAR_PER_VEC - 1), %eax +#endif + /* By this point rax contains number of bytes we need to skip. */ + shrx %VRAX, %VRCX, %VRCX + + /* Calculates CHAR_PER_VEC - eax and stores in eax. */ + negl %eax + andl $(CHAR_PER_VEC - 1), %eax + + movq %rsi, %rdx + bsf %VRCX, %VRDX + cmpq %rax, %rdx + ja L(cross_page_continue) + + /* The vector had a null terminator or we are at the limit. */ + movl %edx, %eax + cmpq %rdx, %rsi + cmovb %esi, %eax + ret + +END(STRNLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S index 91b16830eb..cba585452c 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -1,423 +1,7 @@ -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. - Copyright (C) 2022-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <isa-level.h> -#include <sysdep.h> - -#if ISA_SHOULD_BUILD (4) - -# ifndef VEC_SIZE -# include "x86-evex256-vecs.h" -# endif - - -# ifndef STRNLEN -# define STRNLEN __strnlen_evex -# endif - -# ifdef USE_AS_WCSLEN -# define VPCMPEQ vpcmpeqd -# define VPCMPNEQ vpcmpneqd -# define VPTESTN vptestnmd -# define VPTEST vptestmd -# define VPMINU vpminud -# define CHAR_SIZE 4 - -# else -# define VPCMPEQ vpcmpeqb -# define VPCMPNEQ vpcmpneqb -# define VPTESTN vptestnmb -# define VPTEST vptestmb -# define VPMINU vpminub -# define CHAR_SIZE 1 - -# define REG_WIDTH VEC_SIZE -# endif - -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - -# include "reg-macros.h" - -# if CHAR_PER_VEC == 32 -# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) -# else -# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) -# endif - - - -# if CHAR_PER_VEC == 64 -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) -# else -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) -# endif - - -# define XZERO VMM_128(0) -# define VZERO VMM(0) -# define PAGE_SIZE 4096 - - .section SECTION(.text), "ax", @progbits -ENTRY_P2ALIGN (STRNLEN, 6) - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(zero) -# ifdef __ILP32__ - /* Clear the upper 32 bits. 
*/ - movl %esi, %esi -# endif - - movl %edi, %eax - vpxorq %XZERO, %XZERO, %XZERO - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page_boundary) - - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a - null byte. */ - VPCMPEQ (%rdi), %VZERO, %k0 - - KMOV %k0, %VRCX - movq %rsi, %rax - - /* If src (rcx) is zero, bsf does not change the result. NB: - Must use 64-bit bsf here so that upper bits of len are not - cleared. */ - bsfq %rcx, %rax - /* If rax > CHAR_PER_VEC then rcx must have been zero (no null - CHAR) and rsi must be > CHAR_PER_VEC. */ - cmpq $CHAR_PER_VEC, %rax - ja L(more_1x_vec) - /* Check if first match in bounds. */ - cmpq %rax, %rsi - cmovb %esi, %eax - ret - - -# if CHAR_PER_VEC != 32 - .p2align 4,, 2 -L(zero): -L(max_0): - movl %esi, %eax - ret -# endif - - /* Aligned more for strnlen compares remaining length vs 2 * - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before - going to the loop. */ - .p2align 4,, 10 -L(more_1x_vec): -L(cross_page_continue): - /* Compute number of words checked after aligning. */ -# ifdef USE_AS_WCSLEN - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can - overflow. */ - movq %rdi, %rax - andq $(VEC_SIZE * -1), %rdi - subq %rdi, %rax - sarq $2, %rax - leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax -# else - leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax - andq $(VEC_SIZE * -1), %rdi - subq %rdi, %rax -# endif - - - VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 - - cmpq $(CHAR_PER_VEC * 2), %rax - ja L(more_2x_vec) - -L(last_2x_vec_or_less): - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_check) - - /* Check the end of data. */ - SUB_SHORT (CHAR_PER_VEC, rax) - jbe L(max_0) - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jz L(max_0) - /* Best place for LAST_VEC_CHECK if ZMM. */ - .p2align 4,, 8 -L(last_vec_check): - bsf %VRDX, %VRDX - sub %eax, %edx - lea (%rsi, %rdx), %eax - cmovae %esi, %eax - ret - -# if CHAR_PER_VEC == 32 - .p2align 4,, 2 -L(zero): -L(max_0): - movl %esi, %eax - ret -# endif - - .p2align 4,, 8 -L(last_4x_vec_or_less): - addl $(CHAR_PER_VEC * -4), %eax - VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 - subq $(VEC_SIZE * -4), %rdi - cmpl $(CHAR_PER_VEC * 2), %eax - jbe L(last_2x_vec_or_less) - - .p2align 4,, 6 -L(more_2x_vec): - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without - rechecking bounds. */ - - KMOV %k0, %VRDX - - test %VRDX, %VRDX - jnz L(first_vec_x1) - - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x2) - - cmpq $(CHAR_PER_VEC * 4), %rax - ja L(more_4x_vec) - - - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - addl $(CHAR_PER_VEC * -2), %eax - test %VRDX, %VRDX - jnz L(last_vec_check) - - subl $(CHAR_PER_VEC), %eax - jbe L(max_1) - - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - - test %VRDX, %VRDX - jnz L(last_vec_check) -L(max_1): - movl %esi, %eax - ret - - .p2align 4,, 3 -L(first_vec_x2): -# if VEC_SIZE == 64 - /* If VEC_SIZE == 64 we can fit logic for full return label in - spare bytes before next cache line. */ - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax - ret - .p2align 4,, 6 -# else - addl $CHAR_PER_VEC, %esi -# endif -L(first_vec_x1): - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax - ret - - - .p2align 4,, 6 -L(first_vec_x4): -# if VEC_SIZE == 64 - /* If VEC_SIZE == 64 we can fit logic for full return label in - spare bytes before next cache line. 
*/ - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax - ret - .p2align 4,, 6 -# else - addl $CHAR_PER_VEC, %esi -# endif -L(first_vec_x3): - bsf %VRDX, %VRDX - sub %eax, %esi - leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax - ret - - .p2align 4,, 5 -L(more_4x_vec): - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x3) - - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(first_vec_x4) - - /* Check if at last VEC_SIZE * 4 length before aligning for the - loop. */ - cmpq $(CHAR_PER_VEC * 8), %rax - jbe L(last_4x_vec_or_less) - - - /* Compute number of words checked after aligning. */ -# ifdef USE_AS_WCSLEN - /* Need to compute directly for wcslen as CHAR_SIZE * rsi can - overflow. */ - leaq (VEC_SIZE * -3)(%rdi), %rdx -# else - leaq (VEC_SIZE * -3)(%rdi, %rax), %rax -# endif - - subq $(VEC_SIZE * -1), %rdi - - /* Align data to VEC_SIZE * 4. */ -# if VEC_SIZE == 64 - /* Saves code size. No evex512 processor has partial register - stalls. If that change this can be replaced with `andq - $-(VEC_SIZE * 4), %rdi`. */ - xorb %dil, %dil -# else - andq $-(VEC_SIZE * 4), %rdi -# endif - -# ifdef USE_AS_WCSLEN - subq %rdi, %rdx - sarq $2, %rdx - addq %rdx, %rax -# else - subq %rdi, %rax -# endif - /* Compare 4 * VEC at a time forward. */ - .p2align 4,, 11 -L(loop_4x_vec): - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) - VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k2 - subq $-(VEC_SIZE * 4), %rdi - /* Break if at end of length. */ - subq $(CHAR_PER_VEC * 4), %rax - jbe L(loop_len_end) - - - KORTEST %k0, %k2 - jz L(loop_4x_vec) - - -L(loop_last_4x_vec): - movq %rsi, %rcx - subq %rax, %rsi - VPTESTN %VMM(1), %VMM(1), %k1 - KMOV %k1, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x0) - - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x1) - - VPTESTN %VMM(3), %VMM(3), %k0 - - /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for - returning last 2x VEC. For VEC_SIZE == 64 we test each VEC - individually, for VEC_SIZE == 32 we combine them in a single - 64-bit GPR. */ -# if CHAR_PER_VEC == 64 - KMOV %k0, %VRDX - test %VRDX, %VRDX - jnz L(last_vec_x2) - KMOV %k2, %VRDX -# else - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. - */ - kmovd %k2, %edx - kmovd %k0, %eax - salq $CHAR_PER_VEC, %rdx - orq %rax, %rdx -# endif - - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. - */ - bsfq %rdx, %rdx - leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret - - /* Handle last 4x VEC after loop. All VECs have been loaded. */ - .p2align 4,, 4 -L(loop_len_end): - KORTEST %k0, %k2 - jnz L(loop_last_4x_vec) - movq %rsi, %rax - ret - - -# if CHAR_PER_VEC == 64 - /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 - need return label for it. */ - .p2align 4,, 8 -L(last_vec_x2): - bsf %VRDX, %VRDX - leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret -# endif - - - .p2align 4,, 10 -L(last_vec_x1): - addq $CHAR_PER_VEC, %rsi -L(last_vec_x0): - bsf %VRDX, %VRDX - leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax - cmpq %rax, %rcx - cmovb %rcx, %rax - ret - - - .p2align 4,, 8 -L(cross_page_boundary): - /* Align data to VEC_SIZE. 
*/ - movq %rdi, %rcx - andq $-VEC_SIZE, %rcx - VPCMPEQ (%rcx), %VZERO, %k0 - - KMOV %k0, %VRCX -# ifdef USE_AS_WCSLEN - shrl $2, %eax - andl $(CHAR_PER_VEC - 1), %eax -# endif - shrx %VRAX, %VRCX, %VRCX - - negl %eax - andl $(CHAR_PER_VEC - 1), %eax - movq %rsi, %rdx - bsf %VRCX, %VRDX - cmpq %rax, %rdx - ja L(cross_page_continue) - movl %edx, %eax - cmpq %rdx, %rsi - cmovb %esi, %eax - ret -END (STRNLEN) +#ifndef STRNLEN +#define STRNLEN __strnlen_evex #endif + +#include "x86-evex256-vecs.h" +#include "reg-macros.h" +#include "strnlen-evex-base.S" \ No newline at end of file diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S index f8e55883bb..8ef54078f8 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S @@ -1,264 +1,7 @@ -/* Placeholder function, not used by any processor at the moment. - Copyright (C) 2022-2024 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - #ifndef STRNLEN #define STRNLEN __strnlen_evex512 #endif #include "x86-evex512-vecs.h" #include "reg-macros.h" - -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (4) - -# include <sysdep.h> - -# ifdef USE_AS_WCSLEN -# define VPCMPEQ vpcmpeqd -# define VPTESTN vptestnmd -# define VPMINU vpminud -# define CHAR_SIZE 4 -# else -# define VPCMPEQ vpcmpeqb -# define VPTESTN vptestnmb -# define VPMINU vpminub -# define CHAR_SIZE 1 -# endif - -# define PAGE_SIZE 4096 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section SECTION(.text),"ax",@progbits -/* Aligning entry point to 64 byte, provides better performance for - one vector length string. */ -ENTRY_P2ALIGN (STRNLEN, 6) - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(ret_max) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif - - movl %edi, %eax - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) - sall $20, %eax - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax - ja L(page_cross) - - /* Compare [w]char for null, mask bit will be set for match. */ - VPCMPEQ (%rdi), %VMM(0), %k0 - KMOV %k0, %VRCX - /* Store max length in rax. */ - mov %rsi, %rax - /* If rcx is 0, rax will have max length. We can not use VRCX - and VRAX here for evex256 because, upper 32 bits may be - undefined for ecx and eax. */ - bsfq %rcx, %rax - cmp $CHAR_PER_VEC, %rax - ja L(align_more) - cmpq %rax, %rsi - cmovb %esi, %eax - ret - - /* At this point vector max length reached. */ - .p2align 4,,3 -L(ret_max): - movq %rsi, %rax - ret - -L(align_more): - mov %rdi, %rax - /* Align rax to VEC_SIZE. */ - andq $-VEC_SIZE, %rax - movq %rdi, %rdx - subq %rax, %rdx -# ifdef USE_AS_WCSLEN - shr $2, %VRDX -# endif - /* At this point rdx contains [w]chars already compared. */ - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx - /* At this point rdx contains number of w[char] needs to go. 
- Now onwards rdx will keep decrementing with each compare. */ - - /* Loop unroll 4 times for 4 vector loop. */ - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - subq $-VEC_SIZE, %rax - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x2) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x4) - - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - /* Save pointer before 4 x VEC_SIZE alignment. */ - movq %rax, %rcx - - /* Align address to VEC_SIZE * 4 for loop. */ - andq $-(VEC_SIZE * 4), %rax - - subq %rax, %rcx -# ifdef USE_AS_WCSLEN - shr $2, %VRCX -# endif - /* rcx contains number of [w]char will be recompared due to - alignment fixes. rdx must be incremented by rcx to offset - alignment adjustment. */ - addq %rcx, %rdx - /* Need jump as we don't want to add/subtract rdx for first - iteration of 4 x VEC_SIZE aligned loop. */ - - .p2align 4,,11 -L(loop): - /* VPMINU and VPCMP combination provide better performance as - compared to alternative combinations. */ - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) - - VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k1 - - subq $-(VEC_SIZE * 4), %rax - KORTEST %k0, %k1 - - jnz L(loopend) - subq $(CHAR_PER_VEC * 4), %rdx - ja L(loop) - mov %rsi, %rax - ret - -L(loopend): - - VPTESTN %VMM(1), %VMM(1), %k2 - KMOV %k2, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) - - KMOV %k0, %VRCX - /* At this point, if k0 is non zero, null char must be in the - second vector. */ - test %VRCX, %VRCX - jnz L(ret_vec_x2) - - VPTESTN %VMM(3), %VMM(3), %k3 - KMOV %k3, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) - /* At this point null [w]char must be in the fourth vector so no - need to check. */ - KMOV %k1, %VRCX - - /* Fourth, third, second vector terminating are pretty much - same, implemented this way to avoid branching and reuse code - from pre loop exit condition. */ -L(ret_vec_x4): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 3), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax -# endif - - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(ret_vec_x3): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 2), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax -# endif - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(ret_vec_x2): - subq $-VEC_SIZE, %rax -L(ret_vec_x1): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - addq %rcx, %rax - cmpq %rsi, %rax - cmovnb %rsi, %rax - ret - -L(page_cross): - mov %rdi, %rax - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx -# ifdef USE_AS_WCSLEN - sarl $2, %ecx -# endif - /* ecx contains number of w[char] to be skipped as a result - of address alignment. */ - andq $-VEC_SIZE, %rax - VPCMPEQ (%rax), %VMM(0), %k0 - KMOV %k0, %VRDX - /* Ignore number of character for alignment adjustment. 
*/ - shr %cl, %VRDX - jnz L(page_cross_end) - movl $CHAR_PER_VEC, %eax - sub %ecx, %eax - cmp %rax, %rsi - ja L(align_more) - -L(page_cross_end): - bsf %VRDX, %VRAX - cmpq %rsi, %rax - cmovnb %esi, %eax - ret - -END (STRNLEN) -#endif +#include "strnlen-evex-base.S" \ No newline at end of file
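A note on the page-cross check at the entry of the unified file (movl %edi, %eax; shll $20, %eax; cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax; ja L(crosses_page_boundary)): shifting the low 32 address bits left by 20 keeps only the 12 page-offset bits, so a single unsigned compare answers "would the first VEC_SIZE-byte load cross a page boundary?" without a separate AND. A minimal C sketch of the equivalence, assuming 4 KiB pages and taking VEC_SIZE = 64 as in the evex512 build; the function names are illustrative only:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 64	/* assumed: evex512 build; 32 for evex256.  */

/* Straightforward form of the test.  */
static int
crosses_page_plain (uintptr_t p)
{
  return (p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* Shift form used by the entry code: the 12 page-offset bits of the
   address become the top bits of OFF, everything above them is shifted
   out, and one unsigned compare gives the same answer.  */
static int
crosses_page_shifted (uintptr_t p)
{
  uint32_t off = (uint32_t) p << 20;
  return off > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  for (uintptr_t p = 0; p < 4 * PAGE_SIZE; p++)
    assert (crosses_page_plain (p) == crosses_page_shifted (p));
  return 0;
}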