diff mbox series

[v2] x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations.

Message ID 20231214223805.853145-1-matthew.sterrett@intel.com
State New
Headers show
Series [v2] x86: Unifies 'strlen-evex' and 'strlen-evex512' implementations. | expand

Commit Message

Matthew Sterrett Dec. 14, 2023, 10:38 p.m. UTC
This commit uses a common implementation, 'strlen-evex-base.S', for both
'strlen-evex' and 'strlen-evex512'.

The motivation is to reduce the number of implementations to maintain.
This incidentally gives a small performance improvement.

All tests pass on x86.

Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html

Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965

Code Size Changes:
    strlen-evex512.S    :  +24 bytes
    wcslen-evex512.S    :  +54 bytes
---
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
 sysdeps/x86_64/multiarch/strlen-evex.S      | 250 +------------
 sysdeps/x86_64/multiarch/strnlen-evex512.S  | 266 +++++++++++++-
 sysdeps/x86_64/multiarch/wcslen-evex512.S   |   6 +-
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   9 +-
 5 files changed, 439 insertions(+), 472 deletions(-)

Comments

Noah Goldstein Dec. 15, 2023, 12:20 a.m. UTC | #1
On Thu, Dec 14, 2023 at 4:37 PM Matthew Sterrett
<matthew.sterrett@intel.com> wrote:
>
> This commit uses a common implementation 'strlen-evex-base.S' for both
> 'strlen-evex' and 'strlen-evex512'
>
> The motivation is to reduce the number of implementations to maintain.
> This incidentally gives a small performance improvement.
>
> All tests pass on x86.
>
> Benchmarks were taken on SKX.
> https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
>
> Geometric mean for strlen-evex512 over all benchmarks (N=10) was (new/old) 0.939
> Geometric mean for wcslen-evex512 over all benchmarks (N=10) was (new/old) 0.965
>
> Code Size Changes:
>     strlen-evex512.S    :  +24 bytes
>     wcslen-evex512.S    :  +54 bytes
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 380 ++++++++------------
>  sysdeps/x86_64/multiarch/strlen-evex.S      | 250 +------------
>  sysdeps/x86_64/multiarch/strnlen-evex512.S  | 266 +++++++++++++-
>  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   6 +-
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   9 +-
>  5 files changed, 439 insertions(+), 472 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 7305b24e28..6ea9e85aa0 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -1,5 +1,5 @@
> -/* Placeholder function, not used by any processor at the moment.
> -   Copyright (C) 2022-2023 Free Software Foundation, Inc.
> +/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
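This header comment should say "256/512-bit EVEX", since the base file is now shared by both the 256-bit and 512-bit EVEX builds.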
> +   Copyright (C) 2021-2023 Free Software Foundation, Inc.
>     This file is part of the GNU C Library.
>
>     The GNU C Library is free software; you can redistribute it and/or
> @@ -16,7 +16,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -/* UNUSED. Exists purely as reference implementation.  */
>
>  #include <isa-level.h>
>
> @@ -26,272 +25,211 @@
>
>  # ifdef USE_AS_WCSLEN
>  #  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
>  #  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
>  #  define CHAR_SIZE    4
> +#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
>  # else
>  #  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
>  #  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
>  #  define CHAR_SIZE    1
> +#  define CHAR_SIZE_SHIFT_REG(reg)
> +
> +#  define REG_WIDTH    VEC_SIZE
>  # endif
>
> -# define PAGE_SIZE     4096
>  # define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -       .section SECTION(.text),"ax",@progbits
> -/* Aligning entry point to 64 byte, provides better performance for
> -   one vector length string.  */
> -ENTRY_P2ALIGN (STRLEN, 6)
> -# ifdef USE_AS_STRNLEN
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(ret_max)
> -#  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
>  # endif
>
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN(STRLEN, 6)
>         movl    %edi, %eax
> -       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
> -       sall    $20, %eax
> -       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> -       ja      L(page_cross)
> -
> -       /* Compare [w]char for null, mask bit will be set for match.  */
> -       VPCMPEQ (%rdi), %VMM(0), %k0
> -# ifdef USE_AS_STRNLEN
> -       KMOV    %k0, %VRCX
> -       /* Store max length in rax.  */
> -       mov     %rsi, %rax
> -       /* If rcx is 0, rax will have max length.  We can not use VRCX
> -          and VRAX here for evex256 because, upper 32 bits may be
> -          undefined for ecx and eax.  */
> -       bsfq    %rcx, %rax
> -       cmp     $CHAR_PER_VEC, %rax
> -       ja      L(align_more)
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -# else
> +       vpxorq  %XZERO, %XZERO, %XZERO
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> +          null byte.  */
> +       VPCMPEQ (%rdi), %VZERO, %k0
>         KMOV    %k0, %VRAX
>         test    %VRAX, %VRAX
> -       jz      L(align_more)
> +       jz      L(aligned_more)
>         bsf     %VRAX, %VRAX
> -# endif
>         ret
>
> -       /* At this point vector max length reached.  */
> -# ifdef USE_AS_STRNLEN
> -       .p2align 4,,3
> -L(ret_max):
> -       movq    %rsi, %rax
> +       .p2align 4,, 8
> +L(first_vec_x4):
> +       bsf     %VRAX, %VRAX
> +       subl    %ecx, %edi
> +       CHAR_SIZE_SHIFT_REG (edi)
> +       leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
>         ret
> -# endif
>
> -L(align_more):
> -       mov     %rdi, %rax
> -       /* Align rax to VEC_SIZE.  */
> -       andq    $-VEC_SIZE, %rax
> -# ifdef USE_AS_STRNLEN
> -       movq    %rdi, %rdx
> -       subq    %rax, %rdx
> -#  ifdef USE_AS_WCSLEN
> -       shr     $2, %VRDX
> -#  endif
> -       /* At this point rdx contains [w]chars already compared.  */
> -       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
> -       /* At this point rdx contains number of w[char] needs to go.
> -          Now onwards rdx will keep decrementing with each compare.  */
> -# endif
> -
> -       /* Loop unroll 4 times for 4 vector loop.  */
> -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> -       subq    $-VEC_SIZE, %rax
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x1)
>
> -# ifdef USE_AS_STRNLEN
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -# endif
>
> -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x2)
> +       /* Aligned more for strnlen compares remaining length vs 2 *
> +          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> +          going to the loop.  */
> +       .p2align 4,, 10
> +L(aligned_more):
> +       movq    %rdi, %rcx
> +       andq    $(VEC_SIZE * -1), %rdi
> +L(cross_page_continue):
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
>
> -# ifdef USE_AS_STRNLEN
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -# endif
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x3)
>
> -# ifdef USE_AS_STRNLEN
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -# endif
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x4)
>
> -       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x4)
> +       subq    $(VEC_SIZE * -1), %rdi
>
> -# ifdef USE_AS_STRNLEN
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -       /* Save pointer before 4 x VEC_SIZE alignment.  */
> -       movq    %rax, %rcx
> +# if CHAR_PER_VEC == 64
> +       /* No partial register stalls on processors that we use evex512
> +          on and this saves code size.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
>  # endif
>
> -       /* Align address to VEC_SIZE * 4 for loop.  */
> -       andq    $-(VEC_SIZE * 4), %rax
> -
> -# ifdef USE_AS_STRNLEN
> -       subq    %rax, %rcx
> -#  ifdef USE_AS_WCSLEN
> -       shr     $2, %VRCX
> -#  endif
> -       /* rcx contains number of [w]char will be recompared due to
> -          alignment fixes.  rdx must be incremented by rcx to offset
> -          alignment adjustment.  */
> -       addq    %rcx, %rdx
> -       /* Need jump as we don't want to add/subtract rdx for first
> -          iteration of 4 x VEC_SIZE aligned loop.  */
> -# endif
>
> -       .p2align 4,,11
> -L(loop):
> -       /* VPMINU and VPCMP combination provide better performance as
> -          compared to alternative combinations.  */
> -       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> -       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> -       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> -       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> +       /* Compare 4 * VEC at a time forward.  */
> +       .p2align 4
> +L(loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
>         VPTESTN %VMM(2), %VMM(2), %k0
> -       VPTESTN %VMM(4), %VMM(4), %k1
> +       VPTESTN %VMM(4), %VMM(4), %k2
>
> -       subq    $-(VEC_SIZE * 4), %rax
> -       KORTEST %k0, %k1
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       KORTEST %k0, %k2
> +       jz      L(loop_4x_vec)
>
> -# ifndef USE_AS_STRNLEN
> -       jz      L(loop)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
> +
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k2, %VRAX
>  # else
> -       jnz     L(loopend)
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       ja      L(loop)
> -       mov     %rsi, %rax
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rdx, %rax
> +# endif
> +
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
> +       .p2align 4,, 2
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsfq    %rax, %rax
> +       subq    %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
>         ret
> -# endif
>
> -L(loopend):
> -
> -       VPTESTN %VMM(1), %VMM(1), %k2
> -       KMOV    %k2, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x1)
> -
> -       KMOV    %k0, %VRCX
> -       /* At this point, if k0 is non zero, null char must be in the
> -          second vector.  */
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x2)
> -
> -       VPTESTN %VMM(3), %VMM(3), %k3
> -       KMOV    %k3, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x3)
> -       /* At this point null [w]char must be in the fourth vector so no
> -          need to check.  */
> -       KMOV    %k1, %VRCX
> -
> -       /* Fourth, third, second vector terminating are pretty much
> -          same, implemented this way to avoid branching and reuse code
> -          from pre loop exit condition.  */
> -L(ret_vec_x4):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       subq    $-(VEC_SIZE * 3), %rax
> -       shrq    $2, %rax
> -       addq    %rcx, %rax
> -# else
> -       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> -# endif
> -# ifdef USE_AS_STRNLEN
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -# endif
> +       .p2align 4,, 8
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       addq    %rdi, %rax
>         ret
>
> -L(ret_vec_x3):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       subq    $-(VEC_SIZE * 2), %rax
> -       shrq    $2, %rax
> -       addq    %rcx, %rax
> -# else
> -       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> -# endif
> -# ifdef USE_AS_STRNLEN
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -# endif
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
>         ret
>
> -L(ret_vec_x2):
> -       subq    $-VEC_SIZE, %rax
> -L(ret_vec_x1):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> -# endif
> -       addq    %rcx, %rax
> -# ifdef USE_AS_STRNLEN
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -# endif
> +       .p2align 4,, 10
> +       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
> +L(TAIL_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       sub     %VRCX, %VRDI
> +       CHAR_SIZE_SHIFT_REG (VRDI)
> +       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
>         ret
>
> -L(page_cross):
> -       mov     %rdi, %rax
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE - 1), %ecx
> +       .p2align 4,, 8
> +L(cross_page_boundary):
> +       movq    %rdi, %rcx
> +       /* Align data to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rdi
> +
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRAX
>  # ifdef USE_AS_WCSLEN
> -       sarl    $2, %ecx
> -# endif
> -       /* ecx contains number of w[char] to be skipped as a result
> -          of address alignment.  */
> -       andq    $-VEC_SIZE, %rax
> -       VPCMPEQ (%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRDX
> -       /* Ignore number of character for alignment adjustment.  */
> -       shr     %cl, %VRDX
> -# ifdef USE_AS_STRNLEN
> -       jnz     L(page_cross_end)
> -       movl    $CHAR_PER_VEC, %eax
> -       sub     %ecx, %eax
> -       cmp     %rax, %rsi
> -       ja      L(align_more)
> +       movl    %ecx, %edx
> +       shrl    $2, %edx
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +       shrx    %edx, %eax, %eax
> +       testl   %eax, %eax
>  # else
> -       jz      L(align_more)
> -# endif
> -
> -L(page_cross_end):
> -       bsf     %VRDX, %VRAX
> -# ifdef USE_AS_STRNLEN
> -       cmpq    %rsi, %rax
> -       cmovnb  %esi, %eax
> +       shr     %cl, %VRAX
>  # endif
> +       jz      L(cross_page_continue)
> +       bsf     %VRAX, %VRAX
>         ret
>
> -END (STRLEN)
> +END(STRLEN)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
> index 364eeffff6..93ad15e356 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
> @@ -1,245 +1,7 @@
> -/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
> -   Copyright (C) 2021-2023 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# include <sysdep.h>
> -
> -# ifndef STRLEN
> -#  define STRLEN       __strlen_evex
> -# endif
> -
> -# ifndef VEC_SIZE
> -#  include "x86-evex256-vecs.h"
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> -#  define VPCMPEQ      vpcmpeqd
> -#  define VPCMPNEQ     vpcmpneqd
> -#  define VPTESTN      vptestnmd
> -#  define VPTEST       vptestmd
> -#  define VPMINU       vpminud
> -#  define CHAR_SIZE    4
> -#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
> -# else
> -#  define VPCMPEQ      vpcmpeqb
> -#  define VPCMPNEQ     vpcmpneqb
> -#  define VPTESTN      vptestnmb
> -#  define VPTEST       vptestmb
> -#  define VPMINU       vpminub
> -#  define CHAR_SIZE    1
> -#  define CHAR_SIZE_SHIFT_REG(reg)
> -
> -#  define REG_WIDTH    VEC_SIZE
> -# endif
> -
> -# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> -
> -# include "reg-macros.h"
> -
> -# if CHAR_PER_VEC == 64
> -
> -#  define TAIL_RETURN_LBL      first_vec_x2
> -#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> -
> -#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> -#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> -
> -# else
> -
> -#  define TAIL_RETURN_LBL      first_vec_x3
> -#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> -
> -#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> -#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> -# endif
> -
> -# define XZERO VMM_128(0)
> -# define VZERO VMM(0)
> -# define PAGE_SIZE     4096
> -
> -       .section SECTION(.text), "ax", @progbits
> -ENTRY_P2ALIGN (STRLEN, 6)
> -       movl    %edi, %eax
> -       vpxorq  %XZERO, %XZERO, %XZERO
> -       andl    $(PAGE_SIZE - 1), %eax
> -       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page_boundary)
> -
> -       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> -          null byte.  */
> -       VPCMPEQ (%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jz      L(aligned_more)
> -       bsf     %VRAX, %VRAX
> -       ret
> -
> -       .p2align 4,, 8
> -L(first_vec_x4):
> -       bsf     %VRAX, %VRAX
> -       subl    %ecx, %edi
> -       CHAR_SIZE_SHIFT_REG (edi)
> -       leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> -       ret
> -
> -
> -
> -       /* Aligned more for strnlen compares remaining length vs 2 *
> -          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> -          going to the loop.  */
> -       .p2align 4,, 10
> -L(aligned_more):
> -       movq    %rdi, %rcx
> -       andq    $(VEC_SIZE * -1), %rdi
> -L(cross_page_continue):
> -       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> -          rechecking bounds.  */
> -       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x1)
> -
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x2)
> -
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x3)
> -
> -       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x4)
> -
> -       subq    $(VEC_SIZE * -1), %rdi
> -
> -# if CHAR_PER_VEC == 64
> -       /* No partial register stalls on processors that we use evex512
> -          on and this saves code size.  */
> -       xorb    %dil, %dil
> -# else
> -       andq    $-(VEC_SIZE * 4), %rdi
> -# endif
> -
> -
> -
> -       /* Compare 4 * VEC at a time forward.  */
> -       .p2align 4
> -L(loop_4x_vec):
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> -       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> -       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> -       VPTESTN %VMM(2), %VMM(2), %k0
> -       VPTESTN %VMM(4), %VMM(4), %k2
> -
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       KORTEST %k0, %k2
> -       jz      L(loop_4x_vec)
> -
> -       VPTESTN %VMM(1), %VMM(1), %k1
> -       KMOV    %k1, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x0)
> -
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x1)
> -
> -       VPTESTN %VMM(3), %VMM(3), %k0
> -
> -# if CHAR_PER_VEC == 64
> -       KMOV    %k0, %VRAX
> -       test    %VRAX, %VRAX
> -       jnz     L(first_vec_x2)
> -       KMOV    %k2, %VRAX
> -# else
> -       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> -        */
> -       kmovd   %k2, %edx
> -       kmovd   %k0, %eax
> -       salq    $CHAR_PER_VEC, %rdx
> -       orq     %rdx, %rax
> -# endif
> -
> -       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> -        */
> -       .p2align 4,, 2
> -L(FALLTHROUGH_RETURN_LBL):
> -       bsfq    %rax, %rax
> -       subq    %rcx, %rdi
> -       CHAR_SIZE_SHIFT_REG (rdi)
> -       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 8
> -L(first_vec_x0):
> -       bsf     %VRAX, %VRAX
> -       sub     %rcx, %rdi
> -       CHAR_SIZE_SHIFT_REG (rdi)
> -       addq    %rdi, %rax
> -       ret
> -
> -       .p2align 4,, 10
> -L(first_vec_x1):
> -       bsf     %VRAX, %VRAX
> -       sub     %rcx, %rdi
> -       CHAR_SIZE_SHIFT_REG (rdi)
> -       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 10
> -       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
> -        */
> -L(TAIL_RETURN_LBL):
> -       bsf     %VRAX, %VRAX
> -       sub     %VRCX, %VRDI
> -       CHAR_SIZE_SHIFT_REG (VRDI)
> -       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
> -       ret
> -
> -       .p2align 4,, 8
> -L(cross_page_boundary):
> -       movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE.  */
> -       andq    $-VEC_SIZE, %rdi
> -
> -       VPCMPEQ (%rdi), %VZERO, %k0
> -
> -       KMOV    %k0, %VRAX
> -# ifdef USE_AS_WCSLEN
> -       movl    %ecx, %edx
> -       shrl    $2, %edx
> -       andl    $(CHAR_PER_VEC - 1), %edx
> -       shrx    %edx, %eax, %eax
> -       testl   %eax, %eax
> -# else
> -       shr     %cl, %VRAX
> -# endif
> -       jz      L(cross_page_continue)
> -       bsf     %VRAX, %VRAX
> -       ret
> -
> -END (STRLEN)
> +#ifndef STRLEN
> +# define STRLEN                __strlen_evex
>  #endif
> +
> +#include "x86-evex256-vecs.h"
> +#include "reg-macros.h"
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> index 0b7f220214..ebf22c259f 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -1,4 +1,264 @@
> -#define STRLEN __strnlen_evex512
> -#define USE_AS_STRNLEN 1
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022-2023 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
>
> -#include "strlen-evex512.S"
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef STRNLEN
> +#define STRNLEN __strnlen_evex512
> +#endif
> +
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPTESTN      vptestnmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPTESTN      vptestnmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section SECTION(.text),"ax",@progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRNLEN, 6)
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(ret_max)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
> +       sall    $20, %eax
> +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMPEQ (%rdi), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       /* Store max length in rax.  */
> +       mov     %rsi, %rax
> +       /* If rcx is 0, rax will have max length.  We can not use VRCX
> +          and VRAX here for evex256 because, upper 32 bits may be
> +          undefined for ecx and eax.  */
> +       bsfq    %rcx, %rax
> +       cmp     $CHAR_PER_VEC, %rax
> +       ja      L(align_more)
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +
> +       /* At this point vector max length reached.  */
> +       .p2align 4,,3
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +
> +L(align_more):
> +       mov     %rdi, %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +       movq    %rdi, %rdx
> +       subq    %rax, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       shr     $2, %VRDX
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> +       subq    $-VEC_SIZE, %rax
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +
> +       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x4)
> +
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +       /* Save pointer before 4 x VEC_SIZE alignment.  */
> +       movq    %rax, %rcx
> +
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rax
> +
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       shr     $2, %VRCX
> +#  endif
> +       /* rcx contains number of [w]char will be recompared due to
> +          alignment fixes.  rdx must be incremented by rcx to offset
> +          alignment adjustment.  */
> +       addq    %rcx, %rdx
> +       /* Need jump as we don't want to add/subtract rdx for first
> +          iteration of 4 x VEC_SIZE aligned loop.  */
> +
> +       .p2align 4,,11
> +L(loop):
> +       /* VPMINU and VPCMP combination provide better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k1
> +
> +       subq    $-(VEC_SIZE * 4), %rax
> +       KORTEST %k0, %k1
> +
> +       jnz     L(loopend)
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       ja      L(loop)
> +       mov     %rsi, %rax
> +       ret
> +
> +L(loopend):
> +
> +       VPTESTN %VMM(1), %VMM(1), %k2
> +       KMOV    %k2, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x1)
> +
> +       KMOV    %k0, %VRCX
> +       /* At this point, if k0 is non zero, null char must be in the
> +          second vector.  */
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x2)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k3
> +       KMOV    %k3, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x3)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +       KMOV    %k1, %VRCX
> +
> +       /* Fourth, third, second vector terminating are pretty much
> +          same, implemented this way to avoid branching and reuse code
> +          from pre loop exit condition.  */
> +L(ret_vec_x4):
> +       bsf     %VRCX, %VRCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 3), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +       ret
> +
> +L(ret_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 2), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +       ret
> +
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> +       bsf     %VRCX, %VRCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       shrq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +       ret
> +
> +L(page_cross):
> +       mov     %rdi, %rax
> +       movl    %edi, %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       andq    $-VEC_SIZE, %rax
> +       VPCMPEQ (%rax), %VMM(0), %k0
> +       KMOV    %k0, %VRDX
> +       /* Ignore number of character for alignment adjustment.  */
> +       shr     %cl, %VRDX
> +       jnz     L(page_cross_end)
> +       movl    $CHAR_PER_VEC, %eax
> +       sub     %ecx, %eax
> +       cmp     %rax, %rsi
> +       ja      L(align_more)
> +
> +L(page_cross_end):
> +       bsf     %VRDX, %VRAX
> +       cmpq    %rsi, %rax
> +       cmovnb  %esi, %eax
> +       ret
> +
> +END (STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> index f59c372b78..aff288a66b 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -1,4 +1,8 @@
> -#define STRLEN __wcslen_evex512
> +#ifndef WCSLEN
> +# define WCSLEN        __wcslen_evex512
> +#endif
> +
> +#define STRLEN WCSLEN
>  #define USE_AS_WCSLEN 1
>
>  #include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> index 73dcf2f210..1c37d74fc9 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -1,5 +1,8 @@
> -#define STRLEN __wcsnlen_evex512
> +#ifndef WCSNLEN
> +# define WCSNLEN       __wcsnlen_evex512
> +#endif
> +
> +#define STRNLEN        WCSNLEN
>  #define USE_AS_WCSLEN 1
> -#define USE_AS_STRNLEN 1
>
> -#include "strlen-evex512.S"
> +#include "strnlen-evex512.S"
> --
> 2.37.2
>
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 7305b24e28..6ea9e85aa0 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -1,5 +1,5 @@ 
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+/* strlen/wcslen optimized with 256-bit or 512-bit EVEX instructions.
+   Copyright (C) 2021-2023 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,7 +16,6 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* UNUSED. Exists purely as reference implementation.  */
 
 #include <isa-level.h>
 
@@ -26,272 +25,211 @@ 
 
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRLEN, 6)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(ret_max)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
 
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall	$20, %eax
-	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja	L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ	(%rdi), %VMM(0), %k0
-# ifdef USE_AS_STRNLEN
-	KMOV	%k0, %VRCX
-	/* Store max length in rax.  */
-	mov	%rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq	%rcx, %rax
-	cmp	$CHAR_PER_VEC, %rax
-	ja	L(align_more)
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-# else
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
 	KMOV	%k0, %VRAX
 	test	%VRAX, %VRAX
-	jz	L(align_more)
+	jz	L(aligned_more)
 	bsf	%VRAX, %VRAX
-# endif
 	ret
 
-	/* At this point vector max length reached.  */
-# ifdef USE_AS_STRNLEN
-	.p2align 4,,3
-L(ret_max):
-	movq	%rsi, %rax
+	.p2align 4,, 8
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 	ret
-# endif
 
-L(align_more):
-	mov	%rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rax
-# ifdef USE_AS_STRNLEN
-	movq	%rdi, %rdx
-	subq	%rax, %rdx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRDX
-#  endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-# endif
-
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	subq	$-VEC_SIZE, %rax
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
 
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
+	/* Save the original pointer in rcx and align rdi down to
+	   VEC_SIZE.  The next 4 vectors are then checked one at a time
+	   before entering the 4x unrolled loop.  */
+	.p2align 4,, 10
+L(aligned_more):
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
+L(cross_page_continue):
+	/* rdi is VEC_SIZE aligned here and the first vector held no
+	   null, so check the following vectors with aligned loads.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
 
-	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-# endif
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
 
-	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x4)
+	subq	$(VEC_SIZE * -1), %rdi
 
-# ifdef USE_AS_STRNLEN
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq	%rax, %rcx
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on processors that we use evex512
+	   on and this saves code size.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
 # endif
 
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq	$-(VEC_SIZE * 4), %rax
-
-# ifdef USE_AS_STRNLEN
-	subq	%rax, %rcx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRCX
-#  endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq	%rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-# endif
 
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
 
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
 	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k1
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k0, %k2
+	jz	L(loop_4x_vec)
 
-# ifndef USE_AS_STRNLEN
-	jz      L(loop)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	jnz	L(loopend)
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop)
-	mov	%rsi, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.  */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.  */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
-# endif
 
-L(loopend):
-
-	VPTESTN	%VMM(1), %VMM(1), %k2
-	KMOV	%k2, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	KMOV	%k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	VPTESTN	%VMM(3), %VMM(3), %k3
-	KMOV	%k3, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV	%k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 3), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-L(ret_vec_x3):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 2), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-L(ret_vec_x2):
-	subq	$-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
-	addq	%rcx, %rax
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-# endif
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.  */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
 
-L(page_cross):
-	mov	%rdi, %rax
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
+	.p2align 4,, 8
+L(cross_page_boundary):
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	sarl	$2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq	$-VEC_SIZE, %rax
-	VPCMPEQ	(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRDX
-# ifdef USE_AS_STRNLEN
-	jnz	L(page_cross_end)
-	movl    $CHAR_PER_VEC, %eax
-	sub     %ecx, %eax
-	cmp	%rax, %rsi
-	ja	L(align_more)
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
+	testl	%eax, %eax
 # else
-	jz	L(align_more)
-# endif
-
-L(page_cross_end):
-	bsf	%VRDX, %VRAX
-# ifdef USE_AS_STRNLEN
-	cmpq	%rsi, %rax
-	cmovnb	%esi, %eax
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
 	ret
 
-END (STRLEN)
+END(STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 364eeffff6..93ad15e356 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -1,245 +1,7 @@ 
-/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2021-2023 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifndef STRLEN
-#  define STRLEN	__strlen_evex
-# endif
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPCMPNEQ	vpcmpneqd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPCMPNEQ	vpcmpneqb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-#  define CHAR_SIZE_SHIFT_REG(reg)
-
-#  define REG_WIDTH	VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 64
-
-#  define TAIL_RETURN_LBL	first_vec_x2
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-# else
-
-#  define TAIL_RETURN_LBL	first_vec_x3
-#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-
-#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-# endif
-
-# define XZERO	VMM_128(0)
-# define VZERO	VMM(0)
-# define PAGE_SIZE	4096
-
-	.section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRLEN, 6)
-	movl	%edi, %eax
-	vpxorq	%XZERO, %XZERO, %XZERO
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-	   null byte.  */
-	VPCMPEQ	(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jz	L(aligned_more)
-	bsf	%VRAX, %VRAX
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x4):
-	bsf	%VRAX, %VRAX
-	subl	%ecx, %edi
-	CHAR_SIZE_SHIFT_REG (edi)
-	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-	ret
-
-
-
-	/* Aligned more for strnlen compares remaining length vs 2 *
-	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-	   going to the loop.  */
-	.p2align 4,, 10
-L(aligned_more):
-	movq	%rdi, %rcx
-	andq	$(VEC_SIZE * -1), %rdi
-L(cross_page_continue):
-	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-	   rechecking bounds.  */
-	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x3)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x4)
-
-	subq	$(VEC_SIZE * -1), %rdi
-
-# if CHAR_PER_VEC == 64
-	/* No partial register stalls on processors that we use evex512
-	   on and this saves code size.  */
-	xorb	%dil, %dil
-# else
-	andq	$-(VEC_SIZE * 4), %rdi
-# endif
-
-
-
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
-L(loop_4x_vec):
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k2
-
-	subq	$-(VEC_SIZE * 4), %rdi
-	KORTEST %k0, %k2
-	jz	L(loop_4x_vec)
-
-	VPTESTN	%VMM(1), %VMM(1), %k1
-	KMOV	%k1, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x0)
-
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x1)
-
-	VPTESTN	%VMM(3), %VMM(3), %k0
-
-# if CHAR_PER_VEC == 64
-	KMOV	%k0, %VRAX
-	test	%VRAX, %VRAX
-	jnz	L(first_vec_x2)
-	KMOV	%k2, %VRAX
-# else
-	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-	 */
-	kmovd	%k2, %edx
-	kmovd	%k0, %eax
-	salq	$CHAR_PER_VEC, %rdx
-	orq	%rdx, %rax
-# endif
-
-	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-	 */
-	.p2align 4,, 2
-L(FALLTHROUGH_RETURN_LBL):
-	bsfq	%rax, %rax
-	subq	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(first_vec_x0):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	addq	%rdi, %rax
-	ret
-
-	.p2align 4,, 10
-L(first_vec_x1):
-	bsf	%VRAX, %VRAX
-	sub	%rcx, %rdi
-	CHAR_SIZE_SHIFT_REG (rdi)
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4,, 10
-	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
-	 */
-L(TAIL_RETURN_LBL):
-	bsf	%VRAX, %VRAX
-	sub	%VRCX, %VRDI
-	CHAR_SIZE_SHIFT_REG (VRDI)
-	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
-	ret
-
-	.p2align 4,, 8
-L(cross_page_boundary):
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rdi
-
-	VPCMPEQ	(%rdi), %VZERO, %k0
-
-	KMOV	%k0, %VRAX
-# ifdef USE_AS_WCSLEN
-	movl	%ecx, %edx
-	shrl	$2, %edx
-	andl	$(CHAR_PER_VEC - 1), %edx
-	shrx	%edx, %eax, %eax
-	testl	%eax, %eax
-# else
-	shr	%cl, %VRAX
-# endif
-	jz	L(cross_page_continue)
-	bsf	%VRAX, %VRAX
-	ret
-
-END (STRLEN)
+#ifndef STRLEN
+# define STRLEN		__strlen_evex
 #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index 0b7f220214..ebf22c259f 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,4 +1,264 @@ 
-#define STRLEN __strnlen_evex512
-#define USE_AS_STRNLEN 1
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022-2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
-#include "strlen-evex512.S"
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex512
+#endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+
+	movl	%edi, %eax
+	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMPEQ	(%rdi), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	/* Store max length in rax.  */
+	mov	%rsi, %rax
+	/* If rcx is 0, rax will have max length.  We can not use VRCX
+	   and VRAX here for evex256 because, upper 32 bits may be
+	   undefined for ecx and eax.  */
+	bsfq	%rcx, %rax
+	cmp	$CHAR_PER_VEC, %rax
+	ja	L(align_more)
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+	/* At this point vector max length reached.  */
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+
+L(align_more):
+	mov	%rdi, %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+	movq	%rdi, %rdx
+	subq	%rax, %rdx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRDX
+#  endif
+	/* rdx is the [w]char offset of rdi within its aligned vector.  */
+	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
+	/* rdx now holds the number of [w]chars left to check after the
+	   first aligned vector; it is decremented with each compare.  */
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	subq	$-VEC_SIZE, %rax
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x4)
+
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	shr	$2, %VRCX
+#  endif
+	/* rcx contains the number of [w]chars that will be recompared
+	   because rax was aligned down.  Add it to rdx so the remaining
+	   count stays in sync with rax.  */
+	addq	%rcx, %rdx
+	/* Fall through: the loop loads from (VEC_SIZE * 4)(%rax) and only
+	   adjusts rdx after each 4 x VEC_SIZE iteration.  */
+
+	.p2align 4,,11
+L(loop):
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+
+L(loopend):
+
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %VRCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM(3), %VMM(3), %k3
+	KMOV	%k3, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %VRCX
+
+	/* Fourth, third, second vector terminating are pretty much
+	   same, implemented this way to avoid branching and reuse code
+	   from pre loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x3):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%VRCX, %VRCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to skip as a result of
+	   address alignment.  */
+	andq	$-VEC_SIZE, %rax
+	VPCMPEQ	(%rax), %VMM(0), %k0
+	KMOV	%k0, %VRDX
+	/* Shift out the mask bits for [w]chars before the string start.  */
+	shr	%cl, %VRDX
+	jnz	L(page_cross_end)
+	movl    $CHAR_PER_VEC, %eax
+	sub     %ecx, %eax
+	cmp	%rax, %rsi
+	ja	L(align_more)
+
+L(page_cross_end):
+	bsf	%VRDX, %VRAX
+	cmpq	%rsi, %rax
+	cmovnb	%esi, %eax
+	ret
+
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
index f59c372b78..aff288a66b 100644
--- a/sysdeps/x86_64/multiarch/wcslen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -1,4 +1,8 @@ 
-#define STRLEN __wcslen_evex512
+#ifndef WCSLEN
+# define WCSLEN	__wcslen_evex512
+#endif
+
+#define STRLEN WCSLEN
 #define USE_AS_WCSLEN 1
 
 #include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
index 73dcf2f210..1c37d74fc9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -1,5 +1,8 @@ 
-#define STRLEN __wcsnlen_evex512
+#ifndef WCSNLEN
+# define WCSNLEN	__wcsnlen_evex512
+#endif
+
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex512.S"
+#include "strnlen-evex512.S"