Message ID | 20231214223805.853145-1-matthew.sterrett@intel.com |
---|---|
State | New |
Series | [v2] x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations. |
On Thu, Dec 14, 2023 at 4:37 PM Matthew Sterrett <matthew.sterrett@intel.com> wrote:
>
> This commit uses a common implementation 'strlen-evex-base.S' for both
> 'strlen-evex' and 'strlen-evex512'
>
> The motivation is to reduce the number of implementations to maintain.
> This incidentally gives a small performance improvement.
>
> All tests pass on x86.
>
> Benchmarks were taken on SKX.
> https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
>
> Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
> Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965
>
> Code Size Changes:
> strlen-evex512.S : +24 bytes
> wcslen-evex512.S : +54 bytes
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
>  sysdeps/x86_64/multiarch/strlen-evex.S      | 250 +------------
>  sysdeps/x86_64/multiarch/strnlen-evex512.S  | 266 +++++++++++++-
>  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   6 +-
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   9 +-
>  5 files changed, 439 insertions(+), 472 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 7305b24e28..6ea9e85aa0 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -1,5 +1,5 @@
> -/* Placeholder function, not used by any processor at the moment.
> -   Copyright (C) 2022-2023 Free Software Foundation, Inc.
> +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.

256/512 bit EVEX....
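The 0.939 and 0.965 figures quoted in the commit message are geometric means of the per-benchmark new/old timing ratios (N=10). A minimal sketch of that reduction in C, using a hypothetical helper name (geomean_ratio is not part of the glibc benchtests):

#include <math.h>
#include <stddef.h>

/* Geometric mean of per-benchmark new/old timing ratios, matching how
   the 0.939 (strlen-evex512) and 0.965 (wcslen-evex512) summaries in
   the commit message are computed.  Hypothetical helper for
   illustration only; it is not shipped with glibc.  */
static double
geomean_ratio (const double *new_times, const double *old_times, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (new_times[i] / old_times[i]);
  return exp (log_sum / (double) n);
}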
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S index 7305b24e28..6ea9e85aa0 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S @@ -1,5 +1,5 @@ -/* Placeholder function, not used by any processor at the moment. - Copyright (C) 2022-2023 Free Software Foundation, Inc. +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2021-2023 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,7 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -/* UNUSED. Exists purely as reference implementation. */ #include <isa-level.h> @@ -26,272 +25,211 @@ # ifdef USE_AS_WCSLEN # define VPCMPEQ vpcmpeqd +# define VPCMPNEQ vpcmpneqd # define VPTESTN vptestnmd +# define VPTEST vptestmd # define VPMINU vpminud # define CHAR_SIZE 4 +# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg # else # define VPCMPEQ vpcmpeqb +# define VPCMPNEQ vpcmpneqb # define VPTESTN vptestnmb +# define VPTEST vptestmb # define VPMINU vpminub # define CHAR_SIZE 1 +# define CHAR_SIZE_SHIFT_REG(reg) + +# define REG_WIDTH VEC_SIZE # endif -# define PAGE_SIZE 4096 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - .section SECTION(.text),"ax",@progbits -/* Aligning entry point to 64 byte, provides better performance for - one vector length string. */ -ENTRY_P2ALIGN (STRLEN, 6) -# ifdef USE_AS_STRNLEN - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(ret_max) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif +# include "reg-macros.h" + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) # endif +# define XZERO VMM_128(0) +# define VZERO VMM(0) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(STRLEN, 6) movl %edi, %eax - vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) - sall $20, %eax - cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax - ja L(page_cross) - - /* Compare [w]char for null, mask bit will be set for match. */ - VPCMPEQ (%rdi), %VMM(0), %k0 -# ifdef USE_AS_STRNLEN - KMOV %k0, %VRCX - /* Store max length in rax. */ - mov %rsi, %rax - /* If rcx is 0, rax will have max length. We can not use VRCX - and VRAX here for evex256 because, upper 32 bits may be - undefined for ecx and eax. */ - bsfq %rcx, %rax - cmp $CHAR_PER_VEC, %rax - ja L(align_more) - cmpq %rax, %rsi - cmovb %esi, %eax -# else + vpxorq %XZERO, %XZERO, %XZERO + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMPEQ (%rdi), %VZERO, %k0 KMOV %k0, %VRAX test %VRAX, %VRAX - jz L(align_more) + jz L(aligned_more) bsf %VRAX, %VRAX -# endif ret - /* At this point vector max length reached. 
*/ -# ifdef USE_AS_STRNLEN - .p2align 4,,3 -L(ret_max): - movq %rsi, %rax + .p2align 4,, 8 +L(first_vec_x4): + bsf %VRAX, %VRAX + subl %ecx, %edi + CHAR_SIZE_SHIFT_REG (edi) + leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax ret -# endif -L(align_more): - mov %rdi, %rax - /* Align rax to VEC_SIZE. */ - andq $-VEC_SIZE, %rax -# ifdef USE_AS_STRNLEN - movq %rdi, %rdx - subq %rax, %rdx -# ifdef USE_AS_WCSLEN - shr $2, %VRDX -# endif - /* At this point rdx contains [w]chars already compared. */ - leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx - /* At this point rdx contains number of w[char] needs to go. - Now onwards rdx will keep decrementing with each compare. */ -# endif - - /* Loop unroll 4 times for 4 vector loop. */ - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - subq $-VEC_SIZE, %rax - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) -# ifdef USE_AS_STRNLEN - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) -# endif - VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x2) + /* Aligned more for strnlen compares remaining length vs 2 * + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before + going to the loop. */ + .p2align 4,, 10 +L(aligned_more): + movq %rdi, %rcx + andq $(VEC_SIZE * -1), %rdi +L(cross_page_continue): + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) -# ifdef USE_AS_STRNLEN - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) -# endif + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) - VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x3) -# ifdef USE_AS_STRNLEN - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) -# endif + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x4) - VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 - KMOV %k0, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x4) + subq $(VEC_SIZE * -1), %rdi -# ifdef USE_AS_STRNLEN - subq $CHAR_PER_VEC, %rdx - jbe L(ret_max) - /* Save pointer before 4 x VEC_SIZE alignment. */ - movq %rax, %rcx +# if CHAR_PER_VEC == 64 + /* No partial register stalls on processors that we use evex512 + on and this saves code size. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi # endif - /* Align address to VEC_SIZE * 4 for loop. */ - andq $-(VEC_SIZE * 4), %rax - -# ifdef USE_AS_STRNLEN - subq %rax, %rcx -# ifdef USE_AS_WCSLEN - shr $2, %VRCX -# endif - /* rcx contains number of [w]char will be recompared due to - alignment fixes. rdx must be incremented by rcx to offset - alignment adjustment. */ - addq %rcx, %rdx - /* Need jump as we don't want to add/subtract rdx for first - iteration of 4 x VEC_SIZE aligned loop. */ -# endif - .p2align 4,,11 -L(loop): - /* VPMINU and VPCMP combination provide better performance as - compared to alternative combinations. */ - VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) + /* Compare 4 * VEC at a time forward. 
*/ + .p2align 4 +L(loop_4x_vec): + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k1 + VPTESTN %VMM(4), %VMM(4), %k2 - subq $-(VEC_SIZE * 4), %rax - KORTEST %k0, %k1 + subq $-(VEC_SIZE * 4), %rdi + KORTEST %k0, %k2 + jz L(loop_4x_vec) -# ifndef USE_AS_STRNLEN - jz L(loop) + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) + + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) + + VPTESTN %VMM(3), %VMM(3), %k0 + +# if CHAR_PER_VEC == 64 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k2, %VRAX # else - jnz L(loopend) - subq $(CHAR_PER_VEC * 4), %rdx - ja L(loop) - mov %rsi, %rax + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. */ + kmovd %k2, %edx + kmovd %k0, %eax + salq $CHAR_PER_VEC, %rdx + orq %rdx, %rax +# endif + + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. */ + .p2align 4,, 2 +L(FALLTHROUGH_RETURN_LBL): + bsfq %rax, %rax + subq %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax ret -# endif -L(loopend): - - VPTESTN %VMM(1), %VMM(1), %k2 - KMOV %k2, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x1) - - KMOV %k0, %VRCX - /* At this point, if k0 is non zero, null char must be in the - second vector. */ - test %VRCX, %VRCX - jnz L(ret_vec_x2) - - VPTESTN %VMM(3), %VMM(3), %k3 - KMOV %k3, %VRCX - test %VRCX, %VRCX - jnz L(ret_vec_x3) - /* At this point null [w]char must be in the fourth vector so no - need to check. */ - KMOV %k1, %VRCX - - /* Fourth, third, second vector terminating are pretty much - same, implemented this way to avoid branching and reuse code - from pre loop exit condition. */ -L(ret_vec_x4): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 3), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 3)(%rcx, %rax), %rax -# endif -# ifdef USE_AS_STRNLEN - cmpq %rsi, %rax - cmovnb %rsi, %rax -# endif + .p2align 4,, 8 +L(first_vec_x0): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + addq %rdi, %rax ret -L(ret_vec_x3): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - subq $-(VEC_SIZE * 2), %rax - shrq $2, %rax - addq %rcx, %rax -# else - leaq (VEC_SIZE * 2)(%rcx, %rax), %rax -# endif -# ifdef USE_AS_STRNLEN - cmpq %rsi, %rax - cmovnb %rsi, %rax -# endif + .p2align 4,, 10 +L(first_vec_x1): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret -L(ret_vec_x2): - subq $-VEC_SIZE, %rax -L(ret_vec_x1): - bsf %VRCX, %VRCX - subq %rdi, %rax -# ifdef USE_AS_WCSLEN - shrq $2, %rax -# endif - addq %rcx, %rax -# ifdef USE_AS_STRNLEN - cmpq %rsi, %rax - cmovnb %rsi, %rax -# endif + .p2align 4,, 10 + /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. */ +L(TAIL_RETURN_LBL): + bsf %VRAX, %VRAX + sub %VRCX, %VRDI + CHAR_SIZE_SHIFT_REG (VRDI) + lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX ret -L(page_cross): - mov %rdi, %rax - movl %edi, %ecx - andl $(VEC_SIZE - 1), %ecx + .p2align 4,, 8 +L(cross_page_boundary): + movq %rdi, %rcx + /* Align data to VEC_SIZE. */ + andq $-VEC_SIZE, %rdi + + VPCMPEQ (%rdi), %VZERO, %k0 + + KMOV %k0, %VRAX # ifdef USE_AS_WCSLEN - sarl $2, %ecx -# endif - /* ecx contains number of w[char] to be skipped as a result - of address alignment. 
*/ - andq $-VEC_SIZE, %rax - VPCMPEQ (%rax), %VMM(0), %k0 - KMOV %k0, %VRDX - /* Ignore number of character for alignment adjustment. */ - shr %cl, %VRDX -# ifdef USE_AS_STRNLEN - jnz L(page_cross_end) - movl $CHAR_PER_VEC, %eax - sub %ecx, %eax - cmp %rax, %rsi - ja L(align_more) + movl %ecx, %edx + shrl $2, %edx + andl $(CHAR_PER_VEC - 1), %edx + shrx %edx, %eax, %eax + testl %eax, %eax # else - jz L(align_more) -# endif - -L(page_cross_end): - bsf %VRDX, %VRAX -# ifdef USE_AS_STRNLEN - cmpq %rsi, %rax - cmovnb %esi, %eax + shr %cl, %VRAX # endif + jz L(cross_page_continue) + bsf %VRAX, %VRAX ret -END (STRLEN) +END(STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 364eeffff6..93ad15e356 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -1,245 +1,7 @@ -/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions. - Copyright (C) 2021-2023 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (4) - -# include <sysdep.h> - -# ifndef STRLEN -# define STRLEN __strlen_evex -# endif - -# ifndef VEC_SIZE -# include "x86-evex256-vecs.h" -# endif - -# ifdef USE_AS_WCSLEN -# define VPCMPEQ vpcmpeqd -# define VPCMPNEQ vpcmpneqd -# define VPTESTN vptestnmd -# define VPTEST vptestmd -# define VPMINU vpminud -# define CHAR_SIZE 4 -# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg -# else -# define VPCMPEQ vpcmpeqb -# define VPCMPNEQ vpcmpneqb -# define VPTESTN vptestnmb -# define VPTEST vptestmb -# define VPMINU vpminub -# define CHAR_SIZE 1 -# define CHAR_SIZE_SHIFT_REG(reg) - -# define REG_WIDTH VEC_SIZE -# endif - -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - -# include "reg-macros.h" - -# if CHAR_PER_VEC == 64 - -# define TAIL_RETURN_LBL first_vec_x2 -# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) - -# define FALLTHROUGH_RETURN_LBL first_vec_x3 -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) - -# else - -# define TAIL_RETURN_LBL first_vec_x3 -# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) - -# define FALLTHROUGH_RETURN_LBL first_vec_x2 -# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) -# endif - -# define XZERO VMM_128(0) -# define VZERO VMM(0) -# define PAGE_SIZE 4096 - - .section SECTION(.text), "ax", @progbits -ENTRY_P2ALIGN (STRLEN, 6) - movl %edi, %eax - vpxorq %XZERO, %XZERO, %XZERO - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page_boundary) - - /* Check the first VEC_SIZE bytes. Each bit in K0 represents a - null byte. 
*/ - VPCMPEQ (%rdi), %VZERO, %k0 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jz L(aligned_more) - bsf %VRAX, %VRAX - ret - - .p2align 4,, 8 -L(first_vec_x4): - bsf %VRAX, %VRAX - subl %ecx, %edi - CHAR_SIZE_SHIFT_REG (edi) - leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax - ret - - - - /* Aligned more for strnlen compares remaining length vs 2 * - CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before - going to the loop. */ - .p2align 4,, 10 -L(aligned_more): - movq %rdi, %rcx - andq $(VEC_SIZE * -1), %rdi -L(cross_page_continue): - /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without - rechecking bounds. */ - VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x1) - - VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x2) - - VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x3) - - VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x4) - - subq $(VEC_SIZE * -1), %rdi - -# if CHAR_PER_VEC == 64 - /* No partial register stalls on processors that we use evex512 - on and this saves code size. */ - xorb %dil, %dil -# else - andq $-(VEC_SIZE * 4), %rdi -# endif - - - - /* Compare 4 * VEC at a time forward. */ - .p2align 4 -L(loop_4x_vec): - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) - VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) - VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) - VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) - VPTESTN %VMM(2), %VMM(2), %k0 - VPTESTN %VMM(4), %VMM(4), %k2 - - subq $-(VEC_SIZE * 4), %rdi - KORTEST %k0, %k2 - jz L(loop_4x_vec) - - VPTESTN %VMM(1), %VMM(1), %k1 - KMOV %k1, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x0) - - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x1) - - VPTESTN %VMM(3), %VMM(3), %k0 - -# if CHAR_PER_VEC == 64 - KMOV %k0, %VRAX - test %VRAX, %VRAX - jnz L(first_vec_x2) - KMOV %k2, %VRAX -# else - /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. - */ - kmovd %k2, %edx - kmovd %k0, %eax - salq $CHAR_PER_VEC, %rdx - orq %rdx, %rax -# endif - - /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. - */ - .p2align 4,, 2 -L(FALLTHROUGH_RETURN_LBL): - bsfq %rax, %rax - subq %rcx, %rdi - CHAR_SIZE_SHIFT_REG (rdi) - leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax - ret - - .p2align 4,, 8 -L(first_vec_x0): - bsf %VRAX, %VRAX - sub %rcx, %rdi - CHAR_SIZE_SHIFT_REG (rdi) - addq %rdi, %rax - ret - - .p2align 4,, 10 -L(first_vec_x1): - bsf %VRAX, %VRAX - sub %rcx, %rdi - CHAR_SIZE_SHIFT_REG (rdi) - leaq (CHAR_PER_VEC)(%rdi, %rax), %rax - ret - - .p2align 4,, 10 - /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. - */ -L(TAIL_RETURN_LBL): - bsf %VRAX, %VRAX - sub %VRCX, %VRDI - CHAR_SIZE_SHIFT_REG (VRDI) - lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX - ret - - .p2align 4,, 8 -L(cross_page_boundary): - movq %rdi, %rcx - /* Align data to VEC_SIZE. 
*/ - andq $-VEC_SIZE, %rdi - - VPCMPEQ (%rdi), %VZERO, %k0 - - KMOV %k0, %VRAX -# ifdef USE_AS_WCSLEN - movl %ecx, %edx - shrl $2, %edx - andl $(CHAR_PER_VEC - 1), %edx - shrx %edx, %eax, %eax - testl %eax, %eax -# else - shr %cl, %VRAX -# endif - jz L(cross_page_continue) - bsf %VRAX, %VRAX - ret - -END (STRLEN) +#ifndef STRLEN +# define STRLEN __strlen_evex #endif + +#include "x86-evex256-vecs.h" +#include "reg-macros.h" +#include "strlen-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S index 0b7f220214..ebf22c259f 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S @@ -1,4 +1,264 @@ -#define STRLEN __strnlen_evex512 -#define USE_AS_STRNLEN 1 +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. -#include "strlen-evex512.S" + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef STRNLEN +#define STRNLEN __strnlen_evex512 +#endif + +#include "x86-evex512-vecs.h" +#include "reg-macros.h" + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPTESTN vptestnmd +# define VPMINU vpminud +# define CHAR_SIZE 4 +# else +# define VPCMPEQ vpcmpeqb +# define VPTESTN vptestnmb +# define VPMINU vpminub +# define CHAR_SIZE 1 +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (STRNLEN, 6) + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(ret_max) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + + movl %edi, %eax + vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0) + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(page_cross) + + /* Compare [w]char for null, mask bit will be set for match. */ + VPCMPEQ (%rdi), %VMM(0), %k0 + KMOV %k0, %VRCX + /* Store max length in rax. */ + mov %rsi, %rax + /* If rcx is 0, rax will have max length. We can not use VRCX + and VRAX here for evex256 because, upper 32 bits may be + undefined for ecx and eax. */ + bsfq %rcx, %rax + cmp $CHAR_PER_VEC, %rax + ja L(align_more) + cmpq %rax, %rsi + cmovb %esi, %eax + ret + + /* At this point vector max length reached. */ + .p2align 4,,3 +L(ret_max): + movq %rsi, %rax + ret + +L(align_more): + mov %rdi, %rax + /* Align rax to VEC_SIZE. */ + andq $-VEC_SIZE, %rax + movq %rdi, %rdx + subq %rax, %rdx +# ifdef USE_AS_WCSLEN + shr $2, %VRDX +# endif + /* At this point rdx contains [w]chars already compared. */ + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx + /* At this point rdx contains number of w[char] needs to go. + Now onwards rdx will keep decrementing with each compare. 
*/ + + /* Loop unroll 4 times for 4 vector loop. */ + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 + subq $-VEC_SIZE, %rax + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) + + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) + + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x3) + + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) + + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x4) + + subq $CHAR_PER_VEC, %rdx + jbe L(ret_max) + /* Save pointer before 4 x VEC_SIZE alignment. */ + movq %rax, %rcx + + /* Align address to VEC_SIZE * 4 for loop. */ + andq $-(VEC_SIZE * 4), %rax + + subq %rax, %rcx +# ifdef USE_AS_WCSLEN + shr $2, %VRCX +# endif + /* rcx contains number of [w]char will be recompared due to + alignment fixes. rdx must be incremented by rcx to offset + alignment adjustment. */ + addq %rcx, %rdx + /* Need jump as we don't want to add/subtract rdx for first + iteration of 4 x VEC_SIZE aligned loop. */ + + .p2align 4,,11 +L(loop): + /* VPMINU and VPCMP combination provide better performance as + compared to alternative combinations. */ + VMOVA (VEC_SIZE * 4)(%rax), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rax), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4) + + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k1 + + subq $-(VEC_SIZE * 4), %rax + KORTEST %k0, %k1 + + jnz L(loopend) + subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop) + mov %rsi, %rax + ret + +L(loopend): + + VPTESTN %VMM(1), %VMM(1), %k2 + KMOV %k2, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + + KMOV %k0, %VRCX + /* At this point, if k0 is non zero, null char must be in the + second vector. */ + test %VRCX, %VRCX + jnz L(ret_vec_x2) + + VPTESTN %VMM(3), %VMM(3), %k3 + KMOV %k3, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x3) + /* At this point null [w]char must be in the fourth vector so no + need to check. */ + KMOV %k1, %VRCX + + /* Fourth, third, second vector terminating are pretty much + same, implemented this way to avoid branching and reuse code + from pre loop exit condition. */ +L(ret_vec_x4): + bsf %VRCX, %VRCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + subq $-(VEC_SIZE * 3), %rax + shrq $2, %rax + addq %rcx, %rax +# else + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax +# endif + + cmpq %rsi, %rax + cmovnb %rsi, %rax + ret + +L(ret_vec_x3): + bsf %VRCX, %VRCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + subq $-(VEC_SIZE * 2), %rax + shrq $2, %rax + addq %rcx, %rax +# else + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax +# endif + cmpq %rsi, %rax + cmovnb %rsi, %rax + ret + +L(ret_vec_x2): + subq $-VEC_SIZE, %rax +L(ret_vec_x1): + bsf %VRCX, %VRCX + subq %rdi, %rax +# ifdef USE_AS_WCSLEN + shrq $2, %rax +# endif + addq %rcx, %rax + cmpq %rsi, %rax + cmovnb %rsi, %rax + ret + +L(page_cross): + mov %rdi, %rax + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx +# ifdef USE_AS_WCSLEN + sarl $2, %ecx +# endif + /* ecx contains number of w[char] to be skipped as a result + of address alignment. */ + andq $-VEC_SIZE, %rax + VPCMPEQ (%rax), %VMM(0), %k0 + KMOV %k0, %VRDX + /* Ignore number of character for alignment adjustment. 
*/ + shr %cl, %VRDX + jnz L(page_cross_end) + movl $CHAR_PER_VEC, %eax + sub %ecx, %eax + cmp %rax, %rsi + ja L(align_more) + +L(page_cross_end): + bsf %VRDX, %VRAX + cmpq %rsi, %rax + cmovnb %esi, %eax + ret + +END (STRNLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S index f59c372b78..aff288a66b 100644 --- a/sysdeps/x86_64/multiarch/wcslen-evex512.S +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S @@ -1,4 +1,8 @@ -#define STRLEN __wcslen_evex512 +#ifndef WCSLEN +# define WCSLEN __wcslen_evex512 +#endif + +#define STRLEN WCSLEN #define USE_AS_WCSLEN 1 #include "strlen-evex512.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S index 73dcf2f210..1c37d74fc9 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S @@ -1,5 +1,8 @@ -#define STRLEN __wcsnlen_evex512 +#ifndef WCSNLEN +# define WCSNLEN __wcsnlen_evex512 +#endif + +#define STRNLEN WCSNLEN #define USE_AS_WCSLEN 1 -#define USE_AS_STRNLEN 1 -#include "strlen-evex512.S" +#include "strnlen-evex512.S"
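The wcslen-evex512.S and wcsnlen-evex512.S hunks above show the whole unification recipe: name the exported symbol, select a vector-width header, and include the shared implementation. As a hedged illustration only, a further EVEX strlen flavour could be wired up the same way after this patch; the file and the symbol __strlen_evex_example below are hypothetical and not part of this series.

/* Hypothetical wrapper for illustration only -- not a file in this
   series.  It mirrors the rewritten strlen-evex.S above: name the
   exported symbol, pick a vector-width header, then pull in the
   shared strlen-evex-base.S.  */
#ifndef STRLEN
# define STRLEN __strlen_evex_example	/* hypothetical symbol name */
#endif

/* x86-evex256-vecs.h selects the 256-bit (YMM) build; the evex512
   wrappers in this series use x86-evex512-vecs.h instead.  */
#include "x86-evex256-vecs.h"
#include "reg-macros.h"
#include "strlen-evex-base.S"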