
x86: Unifies 'strnlen-evex' and 'strnlen-evex512' implementations.

Message ID 20240809220509.3631574-1-matthew.sterrett@intel.com
State New
Series x86: Unifies 'strnlen-evex' and 'strnlen-evex512' implementations.

Commit Message

Matthew Sterrett Aug. 9, 2024, 10:05 p.m. UTC
This commit uses a common implementation 'strnlen-evex-base.S' for both
'strnlen-evex' and 'strnlen-evex512'.

This patch both reduces the number of implementations and makes some small optimizations that benefit strnlen-evex and strnlen-evex512.

All tests pass on x86.

Benchmarks were taken on SKX.
https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html

Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881
Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953
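
(For reference, the (new/old) figure is a geometric mean of per-benchmark
timing ratios, so values below 1.0 mean the new code is faster on average.
A minimal C sketch of that calculation, using illustrative names rather than
the actual glibc benchtests harness:)

    #include <math.h>
    #include <stddef.h>

    /* Sketch only: not the glibc benchtests harness.  Geometric mean of
       new/old timing ratios over N benchmarks.  A result below 1.0 means
       the new implementation is faster on average.  */
    static double
    geomean_ratio (const double *new_times, const double *old_times, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (new_times[i] / old_times[i]);
      return exp (log_sum / n);
    }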

Code Size Changes:
    strnlen-evex       :  +31 bytes
    strnlen-evex512    :  +156 bytes
---
 sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strnlen-evex.S      | 428 +----------------
 sysdeps/x86_64/multiarch/strnlen-evex512.S   | 259 +----------
 3 files changed, 469 insertions(+), 680 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S
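
For readers less familiar with these routines: both variants implement
strnlen/wcsnlen semantics, i.e. return the index of the first null
terminator, clamped to the caller-supplied limit. A portable reference
sketch of that behaviour follows (not the glibc implementation; the ref_*
names are illustrative only):

    #include <stddef.h>
    #include <wchar.h>

    /* Reference sketch, not the glibc code: length of S, but never more
       than MAXLEN and never past the first null terminator.  */
    static size_t
    ref_strnlen (const char *s, size_t maxlen)
    {
      size_t i = 0;
      while (i < maxlen && s[i] != '\0')
        i++;
      return i;
    }

    /* Wide-character variant; CHAR_SIZE is 4 instead of 1, which is why
       the assembly scales counts with `sarq $2` in the USE_AS_WCSLEN
       paths.  */
    static size_t
    ref_wcsnlen (const wchar_t *s, size_t maxlen)
    {
      size_t i = 0;
      while (i < maxlen && s[i] != L'\0')
        i++;
      return i;
    }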

Comments

Noah Goldstein Aug. 18, 2024, 11:29 p.m. UTC | #1
On Fri, Aug 9, 2024 at 2:50 PM Matthew Sterrett
<matthew.sterrett@intel.com> wrote:
>
> This commit uses a common implementation 'strnlen-evex-base.S' for both
> 'strnlen-evex' and 'strnlen-evex512'.
>
> This patch both reduces the number of implementations and makes some small optimizations that benefit strnlen-evex and strnlen-evex512.
>
> All tests pass on x86.
>
> Benchmarks were taken on SKX.
> https://www.intel.com/content/www/us/en/products/sku/123613/intel-core-i97900x-xseries-processor-13-75m-cache-up-to-4-30-ghz/specifications.html
>
> Geometric mean for strnlen-evex over all benchmarks (N=10) was (new/old) 0.881
> Geometric mean for strnlen-evex512 over all benchmarks (N=10) was (new/old) 0.953
>
> Code Size Changes:
>     strnlen-evex       :  +31 bytes
>     strnlen-evex512    :  +156 bytes
> ---
>  sysdeps/x86_64/multiarch/strnlen-evex-base.S | 462 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/strnlen-evex.S      | 428 +----------------
>  sysdeps/x86_64/multiarch/strnlen-evex512.S   | 259 +----------
>  3 files changed, 469 insertions(+), 680 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex-base.S
>
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
> new file mode 100644
> index 0000000000..1c2cfdfe06
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
> @@ -0,0 +1,462 @@
> +/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
> +   Copyright (C) 2022-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +#ifdef USE_AS_WCSLEN
> +# define VPCMPEQ       vpcmpeqd
> +# define VPTESTN       vptestnmd
> +# define VPMINU        vpminud
> +# define CHAR_SIZE     4
> +#else
> +# define VPCMPEQ       vpcmpeqb
> +# define VPTESTN       vptestnmb
> +# define VPMINU        vpminub
> +# define CHAR_SIZE     1
> +#endif
> +
> +#define XZERO  VMM_128(0)
> +#define VZERO  VMM(0)
> +#define PAGE_SIZE      4096
> +#define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
> +
> +#if CHAR_PER_VEC == 32
> +# define SUB_SHORT(imm, reg)   subb $(imm), %VGPR_SZ(reg, 8)
> +#else
> +# define SUB_SHORT(imm, reg)   subl $(imm), %VGPR_SZ(reg, 32)
> +#endif
> +
> +#ifdef USE_AS_WCSLEN
> +/* For wide characters, we care more about limiting code size
> +   than about optimally aligning targets, so just cap nop padding
> +   reasonably low.  */
> +# define P2ALIGN(...)  .p2align 4,, 6
> +# define P2ALIGN_CLAMPED(...)  P2ALIGN(__VA_ARGS__)
> +#else
> +# define P2ALIGN(x)    .p2align x
> +# define P2ALIGN_CLAMPED(x, y) .p2align x,, y
> +#endif
> +
> +       .section SECTION(.text), "ax", @progbits
> +       /* Aligning the entry point to 64 bytes provides better performance
> +          for one-vector-length strings.  */
> +ENTRY_P2ALIGN(STRNLEN, 6)
> +       /* rdi is pointer to array, rsi is the upper limit.  */
> +
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +
> +#ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#endif
> +
> +       vpxorq  %XZERO, %XZERO, %XZERO
> +
> +       /* Check that we won't cross a page boundary with our first load.  */
> +       movl    %edi, %eax
> +       shll    $20, %eax
> +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> +       ja      L(crosses_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> +          null byte.  */
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* If src (rcx) is zero, bsf does not change the result.  NB:
> +          Must use 64-bit bsf here so that upper bits of len are not
> +          cleared.  */
> +       movq    %rsi, %rax
> +       bsfq    %rcx, %rax
> +
> +       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> +          CHAR) and rsi must be > CHAR_PER_VEC.  */
> +       cmpq    $CHAR_PER_VEC, %rax
> +       ja      L(more_1x_vec)
> +
> +       /* Check if first match in bounds.  */
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +
> +#if VEC_SIZE == 32
> +       P2ALIGN_CLAMPED(4, 2)
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +#endif
> +
> +       P2ALIGN_CLAMPED(4, 10)
> +L(more_1x_vec):
> +L(cross_page_continue):
> +       /* After this calculation, rax stores the number of elements
> +          left to be processed.  The complexity comes from the fact that
> +          some elements get read twice due to alignment and we need to
> +          be sure we don't count them twice (otherwise it would just be
> +          rsi - CHAR_PER_VEC).  */
> +
> +#ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       movq    %rdi, %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> +#else
> +       /* Calculate ptr + N - VEC_SIZE, align ptr down to VEC_SIZE (mask
> +          off its low bits), then subtract the aligned ptr to get the new
> +          limit value.  */
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +#endif
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> +
> +       /* Checking here is faster for 256-bit but not 512-bit.  */
> +#if VEC_SIZE == 0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +#endif
> +
> +       cmpq    $(CHAR_PER_VEC * 2), %rax
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec_or_less):
> +
> +       /* Checking here is faster for 512-bit but not 256-bit.  */
> +#if VEC_SIZE != 0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +#endif
> +
> +       /* Check for the end of data.  */
> +       SUB_SHORT (CHAR_PER_VEC, rax)
> +       jbe     L(max_0)
> +
> +       /* Check the final remaining vector.  */
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +#if VEC_SIZE == 32
> +       jz      L(max_0)
> +#else
> +       jnz     L(last_vec_check)
> +       P2ALIGN_CLAMPED(4, 2)
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +
> +#endif
> +       P2ALIGN_CLAMPED(4, 4)
> +L(last_vec_check):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %edx
> +       lea     (%rsi, %rdx), %eax
> +       cmovae  %esi, %eax
> +       ret
> +
> +
> +#if VEC_SIZE == 32
> +       P2ALIGN_CLAMPED(4, 8)
> +#endif
> +L(last_4x_vec_or_less):
> +       addl    $(CHAR_PER_VEC * -4), %eax
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> +
> +#if VEC_SIZE == 64
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +#endif
> +
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2), %eax
> +       jbe     L(last_2x_vec_or_less)
> +
> +       P2ALIGN_CLAMPED(4, 6)
> +L(more_2x_vec):
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
> +
> +       /* Already checked in the 256-bit case.  */
> +#if VEC_SIZE != 0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1)
> +#endif
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x2)
> +
> +       cmpq    $(CHAR_PER_VEC * 4), %rax
> +       ja      L(more_4x_vec)
> +
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       addl    $(CHAR_PER_VEC * -2), %eax
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       subb    $(CHAR_PER_VEC), %al
> +       jbe     L(max_1)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +L(max_1):
> +       movl    %esi, %eax
> +       ret
> +
> +
> +       P2ALIGN_CLAMPED(4, 14)
> +L(first_vec_x2):
> +#if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> +       ret
> +       P2ALIGN_CLAMPED(4, 6)
> +#else
> +       addl    $CHAR_PER_VEC, %esi
> +#endif
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> +       ret
> +
> +#if VEC_SIZE == 64
> +       P2ALIGN_CLAMPED(4, 6)
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> +       ret
> +       P2ALIGN_CLAMPED(4, 6)
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> +       ret
> +#endif
> +
> +       P2ALIGN_CLAMPED(6, 20)
> +L(more_4x_vec):
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x4)
> +
> +       /* Check if at last VEC_SIZE * 4 length before aligning for the
> +          loop.  */
> +       cmpq    $(CHAR_PER_VEC * 8), %rax
> +       jbe     L(last_4x_vec_or_less)
> +
> +
> +       /* Compute number of words checked after aligning.  */
> +#ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> +#else
> +       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> +#endif
> +
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +       /* Align data to VEC_SIZE * 4.  */
> +#if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes, this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +#else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +#endif
> +
> +#ifdef USE_AS_WCSLEN
> +       subq    %rdi, %rdx
> +       sarq    $2, %rdx
> +       addq    %rdx, %rax
> +#else
> +       subq    %rdi, %rax
> +#endif
> +
> +       // mov     %rdi, %rdx
> +
> +       P2ALIGN(6)
> +L(loop):
> +       /* VPMINU and VPCMP combination provide better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k1
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       KORTEST %k0, %k1
> +
> +       jnz     L(loopend)
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       ja      L(loop)
> +       mov     %rsi, %rax
> +       ret
> +
> +
> +#if VEC_SIZE == 32
> +       P2ALIGN_CLAMPED(4, 6)
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> +       ret
> +       P2ALIGN_CLAMPED(4, 6)
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> +       ret
> +#endif
> +
> +
> +       P2ALIGN_CLAMPED(4, 11)
> +L(loopend):
> +       /* We found a null terminator in one of the 4 vectors.  */
> +
> +       /* Check the first vector.  */
> +       movq    %rax, %r8
> +       VPTESTN %VMM(1), %VMM(1), %k2
> +       KMOV    %k2, %VRCX
> +       bsf     %rcx, %r8
> +
> +       cmpq    $(CHAR_PER_VEC), %r8
> +       jbe     L(end_vec)
> +
> +       /* Check the second vector.  */
> +       subq    $(CHAR_PER_VEC), %rax
> +       movq    %rax, %r8
> +       KMOV    %k0, %VRCX
> +       bsf     %rcx, %r8
> +
> +       cmpq    $(CHAR_PER_VEC), %r8
> +       jbe     L(end_vec)
> +
> +       /* Check the third vector.  */
> +       subq    $(CHAR_PER_VEC), %rax
> +       movq    %rax, %r8
> +       VPTESTN %VMM(3), %VMM(3), %k2
> +       KMOV    %k2, %VRCX
> +       bsf     %rcx, %r8
> +
> +       cmpq    $(CHAR_PER_VEC), %r8
> +       jbe     L(end_vec)
> +
> +       /* It is in the fourth vector.  */
> +       subq    $(CHAR_PER_VEC), %rax
> +       movq    %rax, %r8
> +       KMOV    %k1, %VRCX
> +       bsf     %rcx, %r8
> +
> +       P2ALIGN_CLAMPED(4, 3)
> +L(end_vec):
> +       /* Get the number of CHARs that have already been processed.  */
> +       movq    %rsi, %rcx
> +       subq    %rax, %rcx
> +
> +       /* Add that to the offset we found the null terminator at.  */
> +       leaq    (%r8, %rcx), %rax
> +
> +       /* Take the min of that and the limit.  */
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +       ret
> +
> +       P2ALIGN_CLAMPED(4, 11)
> +L(crosses_page_boundary):
> +       /* Align data backwards to VEC_SIZE.  */
> +       shrl    $20, %eax
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +#ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +#endif
> +       /* By this point rax contains the number of CHARs we need to skip.  */
> +       shrx    %VRAX, %VRCX, %VRCX
> +
> +       /* Calculate CHAR_PER_VEC - eax and store the result in eax.  */
> +       negl    %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +
> +       movq    %rsi, %rdx
> +       bsf     %VRCX, %VRDX
> +       cmpq    %rax, %rdx
> +       ja      L(cross_page_continue)
> +
> +       /* The vector had a null terminator or we are at the limit.  */
> +       movl    %edx, %eax
> +       cmpq    %rdx, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +
> +END(STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
> index 91b16830eb..cba585452c 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
> @@ -1,423 +1,7 @@
> -/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
> -   Copyright (C) 2022-2024 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <isa-level.h>
> -#include <sysdep.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# ifndef VEC_SIZE
> -#  include "x86-evex256-vecs.h"
> -# endif
> -
> -
> -# ifndef STRNLEN
> -#  define STRNLEN      __strnlen_evex
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> -#  define VPCMPEQ      vpcmpeqd
> -#  define VPCMPNEQ     vpcmpneqd
> -#  define VPTESTN      vptestnmd
> -#  define VPTEST       vptestmd
> -#  define VPMINU       vpminud
> -#  define CHAR_SIZE    4
> -
> -# else
> -#  define VPCMPEQ      vpcmpeqb
> -#  define VPCMPNEQ     vpcmpneqb
> -#  define VPTESTN      vptestnmb
> -#  define VPTEST       vptestmb
> -#  define VPMINU       vpminub
> -#  define CHAR_SIZE    1
> -
> -#  define REG_WIDTH    VEC_SIZE
> -# endif
> -
> -# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> -
> -# include "reg-macros.h"
> -
> -# if CHAR_PER_VEC == 32
> -#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
> -# else
> -#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
> -# endif
> -
> -
> -
> -# if CHAR_PER_VEC == 64
> -#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> -# else
> -#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> -# endif
> -
> -
> -# define XZERO VMM_128(0)
> -# define VZERO VMM(0)
> -# define PAGE_SIZE     4096
> -
> -       .section SECTION(.text), "ax", @progbits
> -ENTRY_P2ALIGN (STRNLEN, 6)
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(zero)
> -# ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -# endif
> -
> -       movl    %edi, %eax
> -       vpxorq  %XZERO, %XZERO, %XZERO
> -       andl    $(PAGE_SIZE - 1), %eax
> -       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page_boundary)
> -
> -       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> -          null byte.  */
> -       VPCMPEQ (%rdi), %VZERO, %k0
> -
> -       KMOV    %k0, %VRCX
> -       movq    %rsi, %rax
> -
> -       /* If src (rcx) is zero, bsf does not change the result.  NB:
> -          Must use 64-bit bsf here so that upper bits of len are not
> -          cleared.  */
> -       bsfq    %rcx, %rax
> -       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> -          CHAR) and rsi must be > CHAR_PER_VEC.  */
> -       cmpq    $CHAR_PER_VEC, %rax
> -       ja      L(more_1x_vec)
> -       /* Check if first match in bounds.  */
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -       ret
> -
> -
> -# if CHAR_PER_VEC != 32
> -       .p2align 4,, 2
> -L(zero):
> -L(max_0):
> -       movl    %esi, %eax
> -       ret
> -# endif
> -
> -       /* Aligned more for strnlen compares remaining length vs 2 *
> -          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> -          going to the loop.  */
> -       .p2align 4,, 10
> -L(more_1x_vec):
> -L(cross_page_continue):
> -       /* Compute number of words checked after aligning.  */
> -# ifdef USE_AS_WCSLEN
> -       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> -          overflow.  */
> -       movq    %rdi, %rax
> -       andq    $(VEC_SIZE * -1), %rdi
> -       subq    %rdi, %rax
> -       sarq    $2, %rax
> -       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> -# else
> -       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
> -       andq    $(VEC_SIZE * -1), %rdi
> -       subq    %rdi, %rax
> -# endif
> -
> -
> -       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> -
> -       cmpq    $(CHAR_PER_VEC * 2), %rax
> -       ja      L(more_2x_vec)
> -
> -L(last_2x_vec_or_less):
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_check)
> -
> -       /* Check the end of data.  */
> -       SUB_SHORT (CHAR_PER_VEC, rax)
> -       jbe     L(max_0)
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jz      L(max_0)
> -       /* Best place for LAST_VEC_CHECK if ZMM.  */
> -       .p2align 4,, 8
> -L(last_vec_check):
> -       bsf     %VRDX, %VRDX
> -       sub     %eax, %edx
> -       lea     (%rsi, %rdx), %eax
> -       cmovae  %esi, %eax
> -       ret
> -
> -# if CHAR_PER_VEC == 32
> -       .p2align 4,, 2
> -L(zero):
> -L(max_0):
> -       movl    %esi, %eax
> -       ret
> -# endif
> -
> -       .p2align 4,, 8
> -L(last_4x_vec_or_less):
> -       addl    $(CHAR_PER_VEC * -4), %eax
> -       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> -       subq    $(VEC_SIZE * -4), %rdi
> -       cmpl    $(CHAR_PER_VEC * 2), %eax
> -       jbe     L(last_2x_vec_or_less)
> -
> -       .p2align 4,, 6
> -L(more_2x_vec):
> -       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> -          rechecking bounds.  */
> -
> -       KMOV    %k0, %VRDX
> -
> -       test    %VRDX, %VRDX
> -       jnz     L(first_vec_x1)
> -
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(first_vec_x2)
> -
> -       cmpq    $(CHAR_PER_VEC * 4), %rax
> -       ja      L(more_4x_vec)
> -
> -
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -       addl    $(CHAR_PER_VEC * -2), %eax
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_check)
> -
> -       subl    $(CHAR_PER_VEC), %eax
> -       jbe     L(max_1)
> -
> -       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_check)
> -L(max_1):
> -       movl    %esi, %eax
> -       ret
> -
> -       .p2align 4,, 3
> -L(first_vec_x2):
> -# if VEC_SIZE == 64
> -       /* If VEC_SIZE == 64 we can fit logic for full return label in
> -          spare bytes before next cache line.  */
> -       bsf     %VRDX, %VRDX
> -       sub     %eax, %esi
> -       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> -       ret
> -       .p2align 4,, 6
> -# else
> -       addl    $CHAR_PER_VEC, %esi
> -# endif
> -L(first_vec_x1):
> -       bsf     %VRDX, %VRDX
> -       sub     %eax, %esi
> -       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> -       ret
> -
> -
> -       .p2align 4,, 6
> -L(first_vec_x4):
> -# if VEC_SIZE == 64
> -       /* If VEC_SIZE == 64 we can fit logic for full return label in
> -          spare bytes before next cache line.  */
> -       bsf     %VRDX, %VRDX
> -       sub     %eax, %esi
> -       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> -       ret
> -       .p2align 4,, 6
> -# else
> -       addl    $CHAR_PER_VEC, %esi
> -# endif
> -L(first_vec_x3):
> -       bsf     %VRDX, %VRDX
> -       sub     %eax, %esi
> -       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> -       ret
> -
> -       .p2align 4,, 5
> -L(more_4x_vec):
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(first_vec_x3)
> -
> -       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(first_vec_x4)
> -
> -       /* Check if at last VEC_SIZE * 4 length before aligning for the
> -          loop.  */
> -       cmpq    $(CHAR_PER_VEC * 8), %rax
> -       jbe     L(last_4x_vec_or_less)
> -
> -
> -       /* Compute number of words checked after aligning.  */
> -# ifdef USE_AS_WCSLEN
> -       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> -          overflow.  */
> -       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> -# else
> -       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> -# endif
> -
> -       subq    $(VEC_SIZE * -1), %rdi
> -
> -       /* Align data to VEC_SIZE * 4.  */
> -# if VEC_SIZE == 64
> -       /* Saves code size.  No evex512 processor has partial register
> -          stalls.  If that change this can be replaced with `andq
> -          $-(VEC_SIZE * 4), %rdi`.  */
> -       xorb    %dil, %dil
> -# else
> -       andq    $-(VEC_SIZE * 4), %rdi
> -# endif
> -
> -# ifdef USE_AS_WCSLEN
> -       subq    %rdi, %rdx
> -       sarq    $2, %rdx
> -       addq    %rdx, %rax
> -# else
> -       subq    %rdi, %rax
> -# endif
> -       /* Compare 4 * VEC at a time forward.  */
> -       .p2align 4,, 11
> -L(loop_4x_vec):
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> -       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> -       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> -       VPTESTN %VMM(2), %VMM(2), %k0
> -       VPTESTN %VMM(4), %VMM(4), %k2
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       /* Break if at end of length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rax
> -       jbe     L(loop_len_end)
> -
> -
> -       KORTEST %k0, %k2
> -       jz      L(loop_4x_vec)
> -
> -
> -L(loop_last_4x_vec):
> -       movq    %rsi, %rcx
> -       subq    %rax, %rsi
> -       VPTESTN %VMM(1), %VMM(1), %k1
> -       KMOV    %k1, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_x0)
> -
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_x1)
> -
> -       VPTESTN %VMM(3), %VMM(3), %k0
> -
> -       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> -          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> -          individually, for VEC_SIZE == 32 we combine them in a single
> -          64-bit GPR.  */
> -# if CHAR_PER_VEC == 64
> -       KMOV    %k0, %VRDX
> -       test    %VRDX, %VRDX
> -       jnz     L(last_vec_x2)
> -       KMOV    %k2, %VRDX
> -# else
> -       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> -        */
> -       kmovd   %k2, %edx
> -       kmovd   %k0, %eax
> -       salq    $CHAR_PER_VEC, %rdx
> -       orq     %rax, %rdx
> -# endif
> -
> -       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> -        */
> -       bsfq    %rdx, %rdx
> -       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> -       cmpq    %rax, %rcx
> -       cmovb   %rcx, %rax
> -       ret
> -
> -       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
> -       .p2align 4,, 4
> -L(loop_len_end):
> -       KORTEST %k0, %k2
> -       jnz     L(loop_last_4x_vec)
> -       movq    %rsi, %rax
> -       ret
> -
> -
> -# if CHAR_PER_VEC == 64
> -       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> -          need return label for it.  */
> -       .p2align 4,, 8
> -L(last_vec_x2):
> -       bsf     %VRDX, %VRDX
> -       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> -       cmpq    %rax, %rcx
> -       cmovb   %rcx, %rax
> -       ret
> -# endif
> -
> -
> -       .p2align 4,, 10
> -L(last_vec_x1):
> -       addq    $CHAR_PER_VEC, %rsi
> -L(last_vec_x0):
> -       bsf     %VRDX, %VRDX
> -       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> -       cmpq    %rax, %rcx
> -       cmovb   %rcx, %rax
> -       ret
> -
> -
> -       .p2align 4,, 8
> -L(cross_page_boundary):
> -       /* Align data to VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andq    $-VEC_SIZE, %rcx
> -       VPCMPEQ (%rcx), %VZERO, %k0
> -
> -       KMOV    %k0, %VRCX
> -# ifdef USE_AS_WCSLEN
> -       shrl    $2, %eax
> -       andl    $(CHAR_PER_VEC - 1), %eax
> -# endif
> -       shrx    %VRAX, %VRCX, %VRCX
> -
> -       negl    %eax
> -       andl    $(CHAR_PER_VEC - 1), %eax
> -       movq    %rsi, %rdx
> -       bsf     %VRCX, %VRDX
> -       cmpq    %rax, %rdx
> -       ja      L(cross_page_continue)
> -       movl    %edx, %eax
> -       cmpq    %rdx, %rsi
> -       cmovb   %esi, %eax
> -       ret
> -END (STRNLEN)
> +#ifndef STRNLEN
> +#define STRNLEN __strnlen_evex
>  #endif
> +
> +#include "x86-evex256-vecs.h"
> +#include "reg-macros.h"
> +#include "strnlen-evex-base.S"
> \ No newline at end of file
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> index f8e55883bb..8ef54078f8 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -1,264 +1,7 @@
> -/* Placeholder function, not used by any processor at the moment.
> -   Copyright (C) 2022-2024 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
>  #ifndef STRNLEN
>  #define STRNLEN __strnlen_evex512
>  #endif
>
>  #include "x86-evex512-vecs.h"
>  #include "reg-macros.h"
> -
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (4)
> -
> -# include <sysdep.h>
> -
> -# ifdef USE_AS_WCSLEN
> -#  define VPCMPEQ      vpcmpeqd
> -#  define VPTESTN      vptestnmd
> -#  define VPMINU       vpminud
> -#  define CHAR_SIZE    4
> -# else
> -#  define VPCMPEQ      vpcmpeqb
> -#  define VPTESTN      vptestnmb
> -#  define VPMINU       vpminub
> -#  define CHAR_SIZE    1
> -# endif
> -
> -# define PAGE_SIZE     4096
> -# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> -
> -       .section SECTION(.text),"ax",@progbits
> -/* Aligning entry point to 64 byte, provides better performance for
> -   one vector length string.  */
> -ENTRY_P2ALIGN (STRNLEN, 6)
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(ret_max)
> -#  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
> -
> -       movl    %edi, %eax
> -       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
> -       sall    $20, %eax
> -       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> -       ja      L(page_cross)
> -
> -       /* Compare [w]char for null, mask bit will be set for match.  */
> -       VPCMPEQ (%rdi), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       /* Store max length in rax.  */
> -       mov     %rsi, %rax
> -       /* If rcx is 0, rax will have max length.  We can not use VRCX
> -          and VRAX here for evex256 because, upper 32 bits may be
> -          undefined for ecx and eax.  */
> -       bsfq    %rcx, %rax
> -       cmp     $CHAR_PER_VEC, %rax
> -       ja      L(align_more)
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -       ret
> -
> -       /* At this point vector max length reached.  */
> -       .p2align 4,,3
> -L(ret_max):
> -       movq    %rsi, %rax
> -       ret
> -
> -L(align_more):
> -       mov     %rdi, %rax
> -       /* Align rax to VEC_SIZE.  */
> -       andq    $-VEC_SIZE, %rax
> -       movq    %rdi, %rdx
> -       subq    %rax, %rdx
> -#  ifdef USE_AS_WCSLEN
> -       shr     $2, %VRDX
> -#  endif
> -       /* At this point rdx contains [w]chars already compared.  */
> -       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
> -       /* At this point rdx contains number of w[char] needs to go.
> -          Now onwards rdx will keep decrementing with each compare.  */
> -
> -       /* Loop unroll 4 times for 4 vector loop.  */
> -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> -       subq    $-VEC_SIZE, %rax
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x1)
> -
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -
> -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x2)
> -
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -
> -       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x3)
> -
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -
> -       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x4)
> -
> -       subq    $CHAR_PER_VEC, %rdx
> -       jbe     L(ret_max)
> -       /* Save pointer before 4 x VEC_SIZE alignment.  */
> -       movq    %rax, %rcx
> -
> -       /* Align address to VEC_SIZE * 4 for loop.  */
> -       andq    $-(VEC_SIZE * 4), %rax
> -
> -       subq    %rax, %rcx
> -#  ifdef USE_AS_WCSLEN
> -       shr     $2, %VRCX
> -#  endif
> -       /* rcx contains number of [w]char will be recompared due to
> -          alignment fixes.  rdx must be incremented by rcx to offset
> -          alignment adjustment.  */
> -       addq    %rcx, %rdx
> -       /* Need jump as we don't want to add/subtract rdx for first
> -          iteration of 4 x VEC_SIZE aligned loop.  */
> -
> -       .p2align 4,,11
> -L(loop):
> -       /* VPMINU and VPCMP combination provide better performance as
> -          compared to alternative combinations.  */
> -       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> -       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> -       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> -       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> -
> -       VPTESTN %VMM(2), %VMM(2), %k0
> -       VPTESTN %VMM(4), %VMM(4), %k1
> -
> -       subq    $-(VEC_SIZE * 4), %rax
> -       KORTEST %k0, %k1
> -
> -       jnz     L(loopend)
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       ja      L(loop)
> -       mov     %rsi, %rax
> -       ret
> -
> -L(loopend):
> -
> -       VPTESTN %VMM(1), %VMM(1), %k2
> -       KMOV    %k2, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x1)
> -
> -       KMOV    %k0, %VRCX
> -       /* At this point, if k0 is non zero, null char must be in the
> -          second vector.  */
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x2)
> -
> -       VPTESTN %VMM(3), %VMM(3), %k3
> -       KMOV    %k3, %VRCX
> -       test    %VRCX, %VRCX
> -       jnz     L(ret_vec_x3)
> -       /* At this point null [w]char must be in the fourth vector so no
> -          need to check.  */
> -       KMOV    %k1, %VRCX
> -
> -       /* Fourth, third, second vector terminating are pretty much
> -          same, implemented this way to avoid branching and reuse code
> -          from pre loop exit condition.  */
> -L(ret_vec_x4):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       subq    $-(VEC_SIZE * 3), %rax
> -       shrq    $2, %rax
> -       addq    %rcx, %rax
> -# else
> -       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> -# endif
> -
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -       ret
> -
> -L(ret_vec_x3):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       subq    $-(VEC_SIZE * 2), %rax
> -       shrq    $2, %rax
> -       addq    %rcx, %rax
> -# else
> -       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> -# endif
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -       ret
> -
> -L(ret_vec_x2):
> -       subq    $-VEC_SIZE, %rax
> -L(ret_vec_x1):
> -       bsf     %VRCX, %VRCX
> -       subq    %rdi, %rax
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> -# endif
> -       addq    %rcx, %rax
> -       cmpq    %rsi, %rax
> -       cmovnb  %rsi, %rax
> -       ret
> -
> -L(page_cross):
> -       mov     %rdi, %rax
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE - 1), %ecx
> -# ifdef USE_AS_WCSLEN
> -       sarl    $2, %ecx
> -# endif
> -       /* ecx contains number of w[char] to be skipped as a result
> -          of address alignment.  */
> -       andq    $-VEC_SIZE, %rax
> -       VPCMPEQ (%rax), %VMM(0), %k0
> -       KMOV    %k0, %VRDX
> -       /* Ignore number of character for alignment adjustment.  */
> -       shr     %cl, %VRDX
> -       jnz     L(page_cross_end)
> -       movl    $CHAR_PER_VEC, %eax
> -       sub     %ecx, %eax
> -       cmp     %rax, %rsi
> -       ja      L(align_more)
> -
> -L(page_cross_end):
> -       bsf     %VRDX, %VRAX
> -       cmpq    %rsi, %rax
> -       cmovnb  %esi, %eax
> -       ret
> -
> -END (STRNLEN)
> -#endif
> +#include "strnlen-evex-base.S"
> \ No newline at end of file
> --
> 2.37.2
>

LGTM

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
H.J. Lu Aug. 19, 2024, 2:35 p.m. UTC | #2
> > -       jbe     L(last_2x_vec_or_less)
> > -
> > -       .p2align 4,, 6
> > -L(more_2x_vec):
> > -       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> > -          rechecking bounds.  */
> > -
> > -       KMOV    %k0, %VRDX
> > -
> > -       test    %VRDX, %VRDX
> > -       jnz     L(first_vec_x1)
> > -
> > -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> > -       KMOV    %k0, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(first_vec_x2)
> > -
> > -       cmpq    $(CHAR_PER_VEC * 4), %rax
> > -       ja      L(more_4x_vec)
> > -
> > -
> > -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> > -       KMOV    %k0, %VRDX
> > -       addl    $(CHAR_PER_VEC * -2), %eax
> > -       test    %VRDX, %VRDX
> > -       jnz     L(last_vec_check)
> > -
> > -       subl    $(CHAR_PER_VEC), %eax
> > -       jbe     L(max_1)
> > -
> > -       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> > -       KMOV    %k0, %VRDX
> > -
> > -       test    %VRDX, %VRDX
> > -       jnz     L(last_vec_check)
> > -L(max_1):
> > -       movl    %esi, %eax
> > -       ret
> > -
> > -       .p2align 4,, 3
> > -L(first_vec_x2):
> > -# if VEC_SIZE == 64
> > -       /* If VEC_SIZE == 64 we can fit logic for full return label in
> > -          spare bytes before next cache line.  */
> > -       bsf     %VRDX, %VRDX
> > -       sub     %eax, %esi
> > -       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> > -       ret
> > -       .p2align 4,, 6
> > -# else
> > -       addl    $CHAR_PER_VEC, %esi
> > -# endif
> > -L(first_vec_x1):
> > -       bsf     %VRDX, %VRDX
> > -       sub     %eax, %esi
> > -       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> > -       ret
> > -
> > -
> > -       .p2align 4,, 6
> > -L(first_vec_x4):
> > -# if VEC_SIZE == 64
> > -       /* If VEC_SIZE == 64 we can fit logic for full return label in
> > -          spare bytes before next cache line.  */
> > -       bsf     %VRDX, %VRDX
> > -       sub     %eax, %esi
> > -       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> > -       ret
> > -       .p2align 4,, 6
> > -# else
> > -       addl    $CHAR_PER_VEC, %esi
> > -# endif
> > -L(first_vec_x3):
> > -       bsf     %VRDX, %VRDX
> > -       sub     %eax, %esi
> > -       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> > -       ret
> > -
> > -       .p2align 4,, 5
> > -L(more_4x_vec):
> > -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> > -       KMOV    %k0, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(first_vec_x3)
> > -
> > -       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> > -       KMOV    %k0, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(first_vec_x4)
> > -
> > -       /* Check if at last VEC_SIZE * 4 length before aligning for the
> > -          loop.  */
> > -       cmpq    $(CHAR_PER_VEC * 8), %rax
> > -       jbe     L(last_4x_vec_or_less)
> > -
> > -
> > -       /* Compute number of words checked after aligning.  */
> > -# ifdef USE_AS_WCSLEN
> > -       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> > -          overflow.  */
> > -       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> > -# else
> > -       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> > -# endif
> > -
> > -       subq    $(VEC_SIZE * -1), %rdi
> > -
> > -       /* Align data to VEC_SIZE * 4.  */
> > -# if VEC_SIZE == 64
> > -       /* Saves code size.  No evex512 processor has partial register
> > -          stalls.  If that change this can be replaced with `andq
> > -          $-(VEC_SIZE * 4), %rdi`.  */
> > -       xorb    %dil, %dil
> > -# else
> > -       andq    $-(VEC_SIZE * 4), %rdi
> > -# endif
> > -
> > -# ifdef USE_AS_WCSLEN
> > -       subq    %rdi, %rdx
> > -       sarq    $2, %rdx
> > -       addq    %rdx, %rax
> > -# else
> > -       subq    %rdi, %rax
> > -# endif
> > -       /* Compare 4 * VEC at a time forward.  */
> > -       .p2align 4,, 11
> > -L(loop_4x_vec):
> > -       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> > -       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> > -       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> > -       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> > -       VPTESTN %VMM(2), %VMM(2), %k0
> > -       VPTESTN %VMM(4), %VMM(4), %k2
> > -       subq    $-(VEC_SIZE * 4), %rdi
> > -       /* Break if at end of length.  */
> > -       subq    $(CHAR_PER_VEC * 4), %rax
> > -       jbe     L(loop_len_end)
> > -
> > -
> > -       KORTEST %k0, %k2
> > -       jz      L(loop_4x_vec)
> > -
> > -
> > -L(loop_last_4x_vec):
> > -       movq    %rsi, %rcx
> > -       subq    %rax, %rsi
> > -       VPTESTN %VMM(1), %VMM(1), %k1
> > -       KMOV    %k1, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(last_vec_x0)
> > -
> > -       KMOV    %k0, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(last_vec_x1)
> > -
> > -       VPTESTN %VMM(3), %VMM(3), %k0
> > -
> > -       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> > -          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> > -          individually, for VEC_SIZE == 32 we combine them in a single
> > -          64-bit GPR.  */
> > -# if CHAR_PER_VEC == 64
> > -       KMOV    %k0, %VRDX
> > -       test    %VRDX, %VRDX
> > -       jnz     L(last_vec_x2)
> > -       KMOV    %k2, %VRDX
> > -# else
> > -       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> > -        */
> > -       kmovd   %k2, %edx
> > -       kmovd   %k0, %eax
> > -       salq    $CHAR_PER_VEC, %rdx
> > -       orq     %rax, %rdx
> > -# endif
> > -
> > -       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> > -        */
> > -       bsfq    %rdx, %rdx
> > -       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> > -       cmpq    %rax, %rcx
> > -       cmovb   %rcx, %rax
> > -       ret
> > -
> > -       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
> > -       .p2align 4,, 4
> > -L(loop_len_end):
> > -       KORTEST %k0, %k2
> > -       jnz     L(loop_last_4x_vec)
> > -       movq    %rsi, %rax
> > -       ret
> > -
> > -
> > -# if CHAR_PER_VEC == 64
> > -       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> > -          need return label for it.  */
> > -       .p2align 4,, 8
> > -L(last_vec_x2):
> > -       bsf     %VRDX, %VRDX
> > -       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> > -       cmpq    %rax, %rcx
> > -       cmovb   %rcx, %rax
> > -       ret
> > -# endif
> > -
> > -
> > -       .p2align 4,, 10
> > -L(last_vec_x1):
> > -       addq    $CHAR_PER_VEC, %rsi
> > -L(last_vec_x0):
> > -       bsf     %VRDX, %VRDX
> > -       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> > -       cmpq    %rax, %rcx
> > -       cmovb   %rcx, %rax
> > -       ret
> > -
> > -
> > -       .p2align 4,, 8
> > -L(cross_page_boundary):
> > -       /* Align data to VEC_SIZE.  */
> > -       movq    %rdi, %rcx
> > -       andq    $-VEC_SIZE, %rcx
> > -       VPCMPEQ (%rcx), %VZERO, %k0
> > -
> > -       KMOV    %k0, %VRCX
> > -# ifdef USE_AS_WCSLEN
> > -       shrl    $2, %eax
> > -       andl    $(CHAR_PER_VEC - 1), %eax
> > -# endif
> > -       shrx    %VRAX, %VRCX, %VRCX
> > -
> > -       negl    %eax
> > -       andl    $(CHAR_PER_VEC - 1), %eax
> > -       movq    %rsi, %rdx
> > -       bsf     %VRCX, %VRDX
> > -       cmpq    %rax, %rdx
> > -       ja      L(cross_page_continue)
> > -       movl    %edx, %eax
> > -       cmpq    %rdx, %rsi
> > -       cmovb   %esi, %eax
> > -       ret
> > -END (STRNLEN)
> > +#ifndef STRNLEN
> > +#define STRNLEN __strnlen_evex
> >  #endif
> > +
> > +#include "x86-evex256-vecs.h"
> > +#include "reg-macros.h"
> > +#include "strnlen-evex-base.S"
> > \ No newline at end of file
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > index f8e55883bb..8ef54078f8 100644
> > --- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -1,264 +1,7 @@
> > -/* Placeholder function, not used by any processor at the moment.
> > -   Copyright (C) 2022-2024 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> >  #ifndef STRNLEN
> >  #define STRNLEN __strnlen_evex512
> >  #endif
> >
> >  #include "x86-evex512-vecs.h"
> >  #include "reg-macros.h"
> > -
> > -#include <isa-level.h>
> > -
> > -#if ISA_SHOULD_BUILD (4)
> > -
> > -# include <sysdep.h>
> > -
> > -# ifdef USE_AS_WCSLEN
> > -#  define VPCMPEQ      vpcmpeqd
> > -#  define VPTESTN      vptestnmd
> > -#  define VPMINU       vpminud
> > -#  define CHAR_SIZE    4
> > -# else
> > -#  define VPCMPEQ      vpcmpeqb
> > -#  define VPTESTN      vptestnmb
> > -#  define VPMINU       vpminub
> > -#  define CHAR_SIZE    1
> > -# endif
> > -
> > -# define PAGE_SIZE     4096
> > -# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > -
> > -       .section SECTION(.text),"ax",@progbits
> > -/* Aligning entry point to 64 byte, provides better performance for
> > -   one vector length string.  */
> > -ENTRY_P2ALIGN (STRNLEN, 6)
> > -       /* Check zero length.  */
> > -       test    %RSI_LP, %RSI_LP
> > -       jz      L(ret_max)
> > -#  ifdef __ILP32__
> > -       /* Clear the upper 32 bits.  */
> > -       movl    %esi, %esi
> > -#  endif
> > -
> > -       movl    %edi, %eax
> > -       vpxorq  %VMM_128(0), %VMM_128(0), %VMM_128(0)
> > -       sall    $20, %eax
> > -       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > -       ja      L(page_cross)
> > -
> > -       /* Compare [w]char for null, mask bit will be set for match.  */
> > -       VPCMPEQ (%rdi), %VMM(0), %k0
> > -       KMOV    %k0, %VRCX
> > -       /* Store max length in rax.  */
> > -       mov     %rsi, %rax
> > -       /* If rcx is 0, rax will have max length.  We can not use VRCX
> > -          and VRAX here for evex256 because, upper 32 bits may be
> > -          undefined for ecx and eax.  */
> > -       bsfq    %rcx, %rax
> > -       cmp     $CHAR_PER_VEC, %rax
> > -       ja      L(align_more)
> > -       cmpq    %rax, %rsi
> > -       cmovb   %esi, %eax
> > -       ret
> > -
> > -       /* At this point vector max length reached.  */
> > -       .p2align 4,,3
> > -L(ret_max):
> > -       movq    %rsi, %rax
> > -       ret
> > -
> > -L(align_more):
> > -       mov     %rdi, %rax
> > -       /* Align rax to VEC_SIZE.  */
> > -       andq    $-VEC_SIZE, %rax
> > -       movq    %rdi, %rdx
> > -       subq    %rax, %rdx
> > -#  ifdef USE_AS_WCSLEN
> > -       shr     $2, %VRDX
> > -#  endif
> > -       /* At this point rdx contains [w]chars already compared.  */
> > -       leaq    -CHAR_PER_VEC(%rsi, %rdx), %rdx
> > -       /* At this point rdx contains number of w[char] needs to go.
> > -          Now onwards rdx will keep decrementing with each compare.  */
> > -
> > -       /* Loop unroll 4 times for 4 vector loop.  */
> > -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > -       subq    $-VEC_SIZE, %rax
> > -       KMOV    %k0, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x1)
> > -
> > -       subq    $CHAR_PER_VEC, %rdx
> > -       jbe     L(ret_max)
> > -
> > -       VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > -       KMOV    %k0, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x2)
> > -
> > -       subq    $CHAR_PER_VEC, %rdx
> > -       jbe     L(ret_max)
> > -
> > -       VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> > -       KMOV    %k0, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x3)
> > -
> > -       subq    $CHAR_PER_VEC, %rdx
> > -       jbe     L(ret_max)
> > -
> > -       VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> > -       KMOV    %k0, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x4)
> > -
> > -       subq    $CHAR_PER_VEC, %rdx
> > -       jbe     L(ret_max)
> > -       /* Save pointer before 4 x VEC_SIZE alignment.  */
> > -       movq    %rax, %rcx
> > -
> > -       /* Align address to VEC_SIZE * 4 for loop.  */
> > -       andq    $-(VEC_SIZE * 4), %rax
> > -
> > -       subq    %rax, %rcx
> > -#  ifdef USE_AS_WCSLEN
> > -       shr     $2, %VRCX
> > -#  endif
> > -       /* rcx contains number of [w]char will be recompared due to
> > -          alignment fixes.  rdx must be incremented by rcx to offset
> > -          alignment adjustment.  */
> > -       addq    %rcx, %rdx
> > -       /* Need jump as we don't want to add/subtract rdx for first
> > -          iteration of 4 x VEC_SIZE aligned loop.  */
> > -
> > -       .p2align 4,,11
> > -L(loop):
> > -       /* VPMINU and VPCMP combination provide better performance as
> > -          compared to alternative combinations.  */
> > -       VMOVA   (VEC_SIZE * 4)(%rax), %VMM(1)
> > -       VPMINU  (VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> > -       VMOVA   (VEC_SIZE * 6)(%rax), %VMM(3)
> > -       VPMINU  (VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
> > -
> > -       VPTESTN %VMM(2), %VMM(2), %k0
> > -       VPTESTN %VMM(4), %VMM(4), %k1
> > -
> > -       subq    $-(VEC_SIZE * 4), %rax
> > -       KORTEST %k0, %k1
> > -
> > -       jnz     L(loopend)
> > -       subq    $(CHAR_PER_VEC * 4), %rdx
> > -       ja      L(loop)
> > -       mov     %rsi, %rax
> > -       ret
> > -
> > -L(loopend):
> > -
> > -       VPTESTN %VMM(1), %VMM(1), %k2
> > -       KMOV    %k2, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x1)
> > -
> > -       KMOV    %k0, %VRCX
> > -       /* At this point, if k0 is non zero, null char must be in the
> > -          second vector.  */
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x2)
> > -
> > -       VPTESTN %VMM(3), %VMM(3), %k3
> > -       KMOV    %k3, %VRCX
> > -       test    %VRCX, %VRCX
> > -       jnz     L(ret_vec_x3)
> > -       /* At this point null [w]char must be in the fourth vector so no
> > -          need to check.  */
> > -       KMOV    %k1, %VRCX
> > -
> > -       /* Fourth, third, second vector terminating are pretty much
> > -          same, implemented this way to avoid branching and reuse code
> > -          from pre loop exit condition.  */
> > -L(ret_vec_x4):
> > -       bsf     %VRCX, %VRCX
> > -       subq    %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > -       subq    $-(VEC_SIZE * 3), %rax
> > -       shrq    $2, %rax
> > -       addq    %rcx, %rax
> > -# else
> > -       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> > -# endif
> > -
> > -       cmpq    %rsi, %rax
> > -       cmovnb  %rsi, %rax
> > -       ret
> > -
> > -L(ret_vec_x3):
> > -       bsf     %VRCX, %VRCX
> > -       subq    %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > -       subq    $-(VEC_SIZE * 2), %rax
> > -       shrq    $2, %rax
> > -       addq    %rcx, %rax
> > -# else
> > -       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> > -# endif
> > -       cmpq    %rsi, %rax
> > -       cmovnb  %rsi, %rax
> > -       ret
> > -
> > -L(ret_vec_x2):
> > -       subq    $-VEC_SIZE, %rax
> > -L(ret_vec_x1):
> > -       bsf     %VRCX, %VRCX
> > -       subq    %rdi, %rax
> > -# ifdef USE_AS_WCSLEN
> > -       shrq    $2, %rax
> > -# endif
> > -       addq    %rcx, %rax
> > -       cmpq    %rsi, %rax
> > -       cmovnb  %rsi, %rax
> > -       ret
> > -
> > -L(page_cross):
> > -       mov     %rdi, %rax
> > -       movl    %edi, %ecx
> > -       andl    $(VEC_SIZE - 1), %ecx
> > -# ifdef USE_AS_WCSLEN
> > -       sarl    $2, %ecx
> > -# endif
> > -       /* ecx contains number of w[char] to be skipped as a result
> > -          of address alignment.  */
> > -       andq    $-VEC_SIZE, %rax
> > -       VPCMPEQ (%rax), %VMM(0), %k0
> > -       KMOV    %k0, %VRDX
> > -       /* Ignore number of character for alignment adjustment.  */
> > -       shr     %cl, %VRDX
> > -       jnz     L(page_cross_end)
> > -       movl    $CHAR_PER_VEC, %eax
> > -       sub     %ecx, %eax
> > -       cmp     %rax, %rsi
> > -       ja      L(align_more)
> > -
> > -L(page_cross_end):
> > -       bsf     %VRDX, %VRAX
> > -       cmpq    %rsi, %rax
> > -       cmovnb  %esi, %eax
> > -       ret
> > -
> > -END (STRNLEN)
> > -#endif
> > +#include "strnlen-evex-base.S"
> > \ No newline at end of file
> > --
> > 2.37.2
> >
>
> LGTM
>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>

I pushed this onto the master branch.
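
For anyone skimming the assembly below, a minimal C reference model of the contract the unified implementation has to satisfy may help: return the index of the first null character, capped at the caller's limit. This is an illustration of the semantics only (the helper name is mine, not glibc code); the EVEX routine computes the same result with masked vector compares, one VEC_SIZE chunk at a time and 4 * VEC_SIZE per iteration in the main loop.

#include <stddef.h>

/* Reference model only: index of the first null character, capped at
   maxlen.  The wide-character build does the same over wchar_t.  */
static size_t
strnlen_ref (const char *s, size_t maxlen)
{
  size_t i;
  for (i = 0; i < maxlen; i++)
    if (s[i] == '\0')
      break;
  return i;
}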

Patch

diff --git a/sysdeps/x86_64/multiarch/strnlen-evex-base.S b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
new file mode 100644
index 0000000000..1c2cfdfe06
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex-base.S
@@ -0,0 +1,462 @@ 
+/* strnlen/wcsnlen optimized with 256/512-bit EVEX instructions.
+   Copyright (C) 2022-2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+#ifdef USE_AS_WCSLEN
+# define VPCMPEQ	vpcmpeqd
+# define VPTESTN	vptestnmd
+# define VPMINU	vpminud
+# define CHAR_SIZE	4
+#else
+# define VPCMPEQ	vpcmpeqb
+# define VPTESTN	vptestnmb
+# define VPMINU	vpminub
+# define CHAR_SIZE	1
+#endif
+
+#define XZERO	VMM_128(0)
+#define VZERO	VMM(0)
+#define PAGE_SIZE	4096
+#define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+#if CHAR_PER_VEC == 32
+# define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+#else
+# define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+#endif
+
+#ifdef USE_AS_WCSLEN
+/* For wide-character, we care more about limiting code size
+   than optimally aligning targets, so just cap nop padding
+   reasonably low.  */
+# define P2ALIGN(...)	.p2align 4,, 6
+# define P2ALIGN_CLAMPED(...)	P2ALIGN(__VA_ARGS__)
+#else
+# define P2ALIGN(x)	.p2align x
+# define P2ALIGN_CLAMPED(x, y)	.p2align x,, y
+#endif
+
+	.section SECTION(.text), "ax", @progbits
+	/* Aligning the entry point to 64 bytes provides better performance
+	   for one-vector-length strings.  */
+ENTRY_P2ALIGN(STRNLEN, 6)
+	/* rdi is pointer to array, rsi is the upper limit.  */
+
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#endif
+
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Check that we won't cross a page boundary with our first load.  */
+	movl	%edi, %eax
+	shll	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(crosses_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRCX
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	movq	%rsi, %rax
+	bsfq	%rcx, %rax
+
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+#endif
+
+	P2ALIGN_CLAMPED(4, 10)
+L(more_1x_vec):
+L(cross_page_continue):
+	/* After this calculation, rax stores the number of elements
+	   left to be processed.  The complexity comes from the fact
+	   that some elements get read twice due to alignment and we
+	   need to be sure we don't count them twice (else, it would
+	   just be rsi - CHAR_PER_VEC).  */
+
+#ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+#else
+	/* Calculate ptr + N - VEC_SIZE, then mask off the low bits,
+	   then subtract ptr to get the new aligned limit value.  */
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+#endif
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	/* Checking here is faster for 256-bit but not 512-bit */
+#if VEC_SIZE == 0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+
+	/* Checking here is faster for 512-bit but not 256-bit */
+#if VEC_SIZE != 0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	/* Check for the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+
+	/* Check the final remaining vector.  */
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+#if VEC_SIZE == 32
+	jz	L(max_0)
+#else
+	jnz	L(last_vec_check)
+	P2ALIGN_CLAMPED(4, 2)
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+
+#endif
+	P2ALIGN_CLAMPED(4, 4)
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 8)
+#endif
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+
+#if VEC_SIZE == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+#endif
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	P2ALIGN_CLAMPED(4, 6)
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+
+	/* Already checked in 256-bit case */
+#if VEC_SIZE != 0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+#endif
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subb	$(CHAR_PER_VEC), %al
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+
+	P2ALIGN_CLAMPED(4, 14)
+L(first_vec_x2):
+#if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+#else
+	addl	$CHAR_PER_VEC, %esi
+#endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+#if VEC_SIZE == 64
+	P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+#endif
+
+	P2ALIGN_CLAMPED(6, 20)
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+#ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+#else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+#endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+#if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+#else
+	andq	$-(VEC_SIZE * 4), %rdi
+#endif
+
+#ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+#else
+	subq	%rdi, %rax
+#endif
+
+	// mov     %rdi, %rdx
+
+	P2ALIGN(6)
+L(loop):
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k1
+
+	subq	$-(VEC_SIZE * 4), %rdi
+	KORTEST	%k0, %k1
+
+	jnz	L(loopend)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(loop)
+	mov	%rsi, %rax
+	ret
+
+
+#if VEC_SIZE == 32
+	P2ALIGN_CLAMPED(4, 6)
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	P2ALIGN_CLAMPED(4, 6)
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+#endif
+
+
+	P2ALIGN_CLAMPED(4, 11)
+L(loopend):
+	/* We found a null terminator in one of the 4 vectors.  */
+
+	/* Check the first vector.  */
+	movq	%rax, %r8
+	VPTESTN	%VMM(1), %VMM(1), %k2
+	KMOV	%k2, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* Check the second vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	KMOV	%k0, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* Check the third vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	KMOV	%k2, %VRCX
+	bsf	%rcx, %r8
+
+	cmpq	$(CHAR_PER_VEC), %r8
+	jbe	L(end_vec)
+
+	/* It is in the fourth vector.  */
+	subq	$(CHAR_PER_VEC), %rax
+	movq	%rax, %r8
+	KMOV	%k1, %VRCX
+	bsf	%rcx, %r8
+
+	P2ALIGN_CLAMPED(4, 3)
+L(end_vec):
+	/* Get the number that has been processed.  */
+	movq	%rsi, %rcx
+	subq	%rax, %rcx
+
+	/* Add that to the offset we found the null terminator at.  */
+	leaq	(%r8, %rcx), %rax
+
+	/* Take the min of that and the limit.  */
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+	ret
+
+	P2ALIGN_CLAMPED(4, 11)
+L(crosses_page_boundary):
+	/* Align data backwards to VEC_SIZE.  */
+	shrl	$20, %eax
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+#ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+#endif
+	/* By this point rax contains the number of characters to skip.  */
+	shrx	%VRAX, %VRCX, %VRCX
+
+	/* Calculates CHAR_PER_VEC - eax and stores in eax.  */
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+
+	/* The vector had a null terminator or we are at the limit.  */
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+
+END(STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 91b16830eb..cba585452c 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,423 +1,7 @@ 
-/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <isa-level.h>
-#include <sysdep.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# ifndef VEC_SIZE
-#  include "x86-evex256-vecs.h"
-# endif
-
-
-# ifndef STRNLEN
-#  define STRNLEN	__strnlen_evex
-# endif
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPCMPNEQ	vpcmpneqd
-#  define VPTESTN	vptestnmd
-#  define VPTEST	vptestmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPCMPNEQ	vpcmpneqb
-#  define VPTESTN	vptestnmb
-#  define VPTEST	vptestmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-
-#  define REG_WIDTH	VEC_SIZE
-# endif
-
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-# include "reg-macros.h"
-
-# if CHAR_PER_VEC == 32
-#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
-# else
-#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
-# endif
-
-
-
-# if CHAR_PER_VEC == 64
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
-# else
-#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
-# endif
-
-
-# define XZERO	VMM_128(0)
-# define VZERO	VMM(0)
-# define PAGE_SIZE	4096
-
-	.section SECTION(.text), "ax", @progbits
-ENTRY_P2ALIGN (STRNLEN, 6)
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-# ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-# endif
-
-	movl	%edi, %eax
-	vpxorq	%XZERO, %XZERO, %XZERO
-	andl	$(PAGE_SIZE - 1), %eax
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
-	   null byte.  */
-	VPCMPEQ	(%rdi), %VZERO, %k0
-
-	KMOV	%k0, %VRCX
-	movq	%rsi, %rax
-
-	/* If src (rcx) is zero, bsf does not change the result.  NB:
-	   Must use 64-bit bsf here so that upper bits of len are not
-	   cleared.  */
-	bsfq	%rcx, %rax
-	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
-	   CHAR) and rsi must be > CHAR_PER_VEC.  */
-	cmpq	$CHAR_PER_VEC, %rax
-	ja	L(more_1x_vec)
-	/* Check if first match in bounds.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
-
-
-# if CHAR_PER_VEC != 32
-	.p2align 4,, 2
-L(zero):
-L(max_0):
-	movl	%esi, %eax
-	ret
-# endif
-
-	/* Aligned more for strnlen compares remaining length vs 2 *
-	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
-	   going to the loop.  */
-	.p2align 4,, 10
-L(more_1x_vec):
-L(cross_page_continue):
-	/* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-	   overflow.  */
-	movq	%rdi, %rax
-	andq	$(VEC_SIZE * -1), %rdi
-	subq	%rdi, %rax
-	sarq	$2, %rax
-	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
-# else
-	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
-	andq	$(VEC_SIZE * -1), %rdi
-	subq	%rdi, %rax
-# endif
-
-
-	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
-
-	cmpq	$(CHAR_PER_VEC * 2), %rax
-	ja	L(more_2x_vec)
-
-L(last_2x_vec_or_less):
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-
-	/* Check the end of data.  */
-	SUB_SHORT (CHAR_PER_VEC, rax)
-	jbe	L(max_0)
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jz	L(max_0)
-	/* Best place for LAST_VEC_CHECK if ZMM.  */
-	.p2align 4,, 8
-L(last_vec_check):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %edx
-	lea	(%rsi, %rdx), %eax
-	cmovae	%esi, %eax
-	ret
-
-# if CHAR_PER_VEC == 32
-	.p2align 4,, 2
-L(zero):
-L(max_0):
-	movl	%esi, %eax
-	ret
-# endif
-
-	.p2align 4,, 8
-L(last_4x_vec_or_less):
-	addl	$(CHAR_PER_VEC * -4), %eax
-	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
-	subq	$(VEC_SIZE * -4), %rdi
-	cmpl	$(CHAR_PER_VEC * 2), %eax
-	jbe	L(last_2x_vec_or_less)
-
-	.p2align 4,, 6
-L(more_2x_vec):
-	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
-	   rechecking bounds.  */
-
-	KMOV	%k0, %VRDX
-
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x1)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x2)
-
-	cmpq	$(CHAR_PER_VEC * 4), %rax
-	ja	L(more_4x_vec)
-
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	addl	$(CHAR_PER_VEC * -2), %eax
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-
-	subl	$(CHAR_PER_VEC), %eax
-	jbe	L(max_1)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_check)
-L(max_1):
-	movl	%esi, %eax
-	ret
-
-	.p2align 4,, 3
-L(first_vec_x2):
-# if VEC_SIZE == 64
-	/* If VEC_SIZE == 64 we can fit logic for full return label in
-	   spare bytes before next cache line.  */
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
-	ret
-	.p2align 4,, 6
-# else
-	addl	$CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x1):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
-	ret
-
-
-	.p2align 4,, 6
-L(first_vec_x4):
-# if VEC_SIZE == 64
-	/* If VEC_SIZE == 64 we can fit logic for full return label in
-	   spare bytes before next cache line.  */
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
-	ret
-	.p2align 4,, 6
-# else
-	addl	$CHAR_PER_VEC, %esi
-# endif
-L(first_vec_x3):
-	bsf	%VRDX, %VRDX
-	sub	%eax, %esi
-	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
-	ret
-
-	.p2align 4,, 5
-L(more_4x_vec):
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x3)
-
-	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(first_vec_x4)
-
-	/* Check if at last VEC_SIZE * 4 length before aligning for the
-	   loop.  */
-	cmpq	$(CHAR_PER_VEC * 8), %rax
-	jbe	L(last_4x_vec_or_less)
-
-
-	/* Compute number of words checked after aligning.  */
-# ifdef USE_AS_WCSLEN
-	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
-	   overflow.  */
-	leaq	(VEC_SIZE * -3)(%rdi), %rdx
-# else
-	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
-# endif
-
-	subq	$(VEC_SIZE * -1), %rdi
-
-	/* Align data to VEC_SIZE * 4.  */
-# if VEC_SIZE == 64
-	/* Saves code size.  No evex512 processor has partial register
-	   stalls.  If that change this can be replaced with `andq
-	   $-(VEC_SIZE * 4), %rdi`.  */
-	xorb	%dil, %dil
-# else
-	andq	$-(VEC_SIZE * 4), %rdi
-# endif
-
-# ifdef USE_AS_WCSLEN
-	subq	%rdi, %rdx
-	sarq	$2, %rdx
-	addq	%rdx, %rax
-# else
-	subq	%rdi, %rax
-# endif
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4,, 11
-L(loop_4x_vec):
-	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k2
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rax
-	jbe	L(loop_len_end)
-
-
-	KORTEST %k0, %k2
-	jz	L(loop_4x_vec)
-
-
-L(loop_last_4x_vec):
-	movq	%rsi, %rcx
-	subq	%rax, %rsi
-	VPTESTN	%VMM(1), %VMM(1), %k1
-	KMOV	%k1, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x0)
-
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x1)
-
-	VPTESTN	%VMM(3), %VMM(3), %k0
-
-	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
-	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
-	   individually, for VEC_SIZE == 32 we combine them in a single
-	   64-bit GPR.  */
-# if CHAR_PER_VEC == 64
-	KMOV	%k0, %VRDX
-	test	%VRDX, %VRDX
-	jnz	L(last_vec_x2)
-	KMOV	%k2, %VRDX
-# else
-	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
-	 */
-	kmovd	%k2, %edx
-	kmovd	%k0, %eax
-	salq	$CHAR_PER_VEC, %rdx
-	orq	%rax, %rdx
-# endif
-
-	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
-	 */
-	bsfq	%rdx, %rdx
-	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-
-	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
-	.p2align 4,, 4
-L(loop_len_end):
-	KORTEST %k0, %k2
-	jnz	L(loop_last_4x_vec)
-	movq	%rsi, %rax
-	ret
-
-
-# if CHAR_PER_VEC == 64
-	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64
-	   need return label for it.  */
-	.p2align 4,, 8
-L(last_vec_x2):
-	bsf	%VRDX, %VRDX
-	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-# endif
-
-
-	.p2align 4,, 10
-L(last_vec_x1):
-	addq	$CHAR_PER_VEC, %rsi
-L(last_vec_x0):
-	bsf	%VRDX, %VRDX
-	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
-	cmpq	%rax, %rcx
-	cmovb	%rcx, %rax
-	ret
-
-
-	.p2align 4,, 8
-L(cross_page_boundary):
-	/* Align data to VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andq	$-VEC_SIZE, %rcx
-	VPCMPEQ	(%rcx), %VZERO, %k0
-
-	KMOV	%k0, %VRCX
-# ifdef USE_AS_WCSLEN
-	shrl	$2, %eax
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	shrx	%VRAX, %VRCX, %VRCX
-
-	negl	%eax
-	andl	$(CHAR_PER_VEC - 1), %eax
-	movq	%rsi, %rdx
-	bsf	%VRCX, %VRDX
-	cmpq	%rax, %rdx
-	ja	L(cross_page_continue)
-	movl	%edx, %eax
-	cmpq	%rdx, %rsi
-	cmovb	%esi, %eax
-	ret
-END (STRNLEN)
+#ifndef STRNLEN
+#define STRNLEN __strnlen_evex
 #endif
+
+#include "x86-evex256-vecs.h"
+#include "reg-macros.h"
+#include "strnlen-evex-base.S"
\ No newline at end of file
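
With strnlen-evex.S reduced to the wrapper above, everything variant-specific is pinned down by the included headers: as I read the vecs headers, x86-evex256-vecs.h selects 256-bit vectors (VEC_SIZE == 32) and x86-evex512-vecs.h 512-bit ones (VEC_SIZE == 64), while CHAR_SIZE is 1 for strnlen and 4 for the USE_AS_WCSLEN build. A throwaway C check of the resulting CHAR_PER_VEC values (not part of the patch; the VEC_SIZE values are my assumption from those headers):

#include <stdio.h>

int
main (void)
{
  /* VEC_SIZE assumed from the 256-/512-bit vecs headers; CHAR_SIZE is
     1 for byte strings and 4 for wide characters.  */
  int vec_sizes[] = { 32, 64 };
  int char_sizes[] = { 1, 4 };
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      printf ("VEC_SIZE=%d CHAR_SIZE=%d -> CHAR_PER_VEC=%d\n",
              vec_sizes[i], char_sizes[j],
              vec_sizes[i] / char_sizes[j]);
  return 0;
}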
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
index f8e55883bb..8ef54078f8 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -1,264 +1,7 @@ 
-/* Placeholder function, not used by any processor at the moment.
-   Copyright (C) 2022-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
 #ifndef STRNLEN
 #define STRNLEN __strnlen_evex512
 #endif
 
 #include "x86-evex512-vecs.h"
 #include "reg-macros.h"
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-# include <sysdep.h>
-
-# ifdef USE_AS_WCSLEN
-#  define VPCMPEQ	vpcmpeqd
-#  define VPTESTN	vptestnmd
-#  define VPMINU	vpminud
-#  define CHAR_SIZE	4
-# else
-#  define VPCMPEQ	vpcmpeqb
-#  define VPTESTN	vptestnmb
-#  define VPMINU	vpminub
-#  define CHAR_SIZE	1
-# endif
-
-# define PAGE_SIZE	4096
-# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
-
-	.section SECTION(.text),"ax",@progbits
-/* Aligning entry point to 64 byte, provides better performance for
-   one vector length string.  */
-ENTRY_P2ALIGN (STRNLEN, 6)
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(ret_max)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-
-	movl	%edi, %eax
-	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
-	sall	$20, %eax
-	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
-	ja	L(page_cross)
-
-	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMPEQ	(%rdi), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	/* Store max length in rax.  */
-	mov	%rsi, %rax
-	/* If rcx is 0, rax will have max length.  We can not use VRCX
-	   and VRAX here for evex256 because, upper 32 bits may be
-	   undefined for ecx and eax.  */
-	bsfq	%rcx, %rax
-	cmp	$CHAR_PER_VEC, %rax
-	ja	L(align_more)
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
-
-	/* At this point vector max length reached.  */
-	.p2align 4,,3
-L(ret_max):
-	movq	%rsi, %rax
-	ret
-
-L(align_more):
-	mov	%rdi, %rax
-	/* Align rax to VEC_SIZE.  */
-	andq	$-VEC_SIZE, %rax
-	movq	%rdi, %rdx
-	subq	%rax, %rdx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRDX
-#  endif
-	/* At this point rdx contains [w]chars already compared.  */
-	leaq	-CHAR_PER_VEC(%rsi, %rdx), %rdx
-	/* At this point rdx contains number of w[char] needs to go.
-	   Now onwards rdx will keep decrementing with each compare.  */
-
-	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	subq	$-VEC_SIZE, %rax
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	VEC_SIZE(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	(VEC_SIZE * 2)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-
-	VPCMPEQ	(VEC_SIZE * 3)(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x4)
-
-	subq	$CHAR_PER_VEC, %rdx
-	jbe	L(ret_max)
-	/* Save pointer before 4 x VEC_SIZE alignment.  */
-	movq	%rax, %rcx
-
-	/* Align address to VEC_SIZE * 4 for loop.  */
-	andq	$-(VEC_SIZE * 4), %rax
-
-	subq	%rax, %rcx
-#  ifdef USE_AS_WCSLEN
-	shr	$2, %VRCX
-#  endif
-	/* rcx contains number of [w]char will be recompared due to
-	   alignment fixes.  rdx must be incremented by rcx to offset
-	   alignment adjustment.  */
-	addq	%rcx, %rdx
-	/* Need jump as we don't want to add/subtract rdx for first
-	   iteration of 4 x VEC_SIZE aligned loop.  */
-
-	.p2align 4,,11
-L(loop):
-	/* VPMINU and VPCMP combination provide better performance as
-	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
-
-	VPTESTN	%VMM(2), %VMM(2), %k0
-	VPTESTN	%VMM(4), %VMM(4), %k1
-
-	subq	$-(VEC_SIZE * 4), %rax
-	KORTEST	%k0, %k1
-
-	jnz	L(loopend)
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop)
-	mov	%rsi, %rax
-	ret
-
-L(loopend):
-
-	VPTESTN	%VMM(1), %VMM(1), %k2
-	KMOV	%k2, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x1)
-
-	KMOV	%k0, %VRCX
-	/* At this point, if k0 is non zero, null char must be in the
-	   second vector.  */
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x2)
-
-	VPTESTN	%VMM(3), %VMM(3), %k3
-	KMOV	%k3, %VRCX
-	test	%VRCX, %VRCX
-	jnz	L(ret_vec_x3)
-	/* At this point null [w]char must be in the fourth vector so no
-	   need to check.  */
-	KMOV	%k1, %VRCX
-
-	/* Fourth, third, second vector terminating are pretty much
-	   same, implemented this way to avoid branching and reuse code
-	   from pre loop exit condition.  */
-L(ret_vec_x4):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 3), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
-# endif
-
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(ret_vec_x3):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	subq	$-(VEC_SIZE * 2), %rax
-	shrq	$2, %rax
-	addq	%rcx, %rax
-# else
-	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
-# endif
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(ret_vec_x2):
-	subq	$-VEC_SIZE, %rax
-L(ret_vec_x1):
-	bsf	%VRCX, %VRCX
-	subq	%rdi, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
-	addq	%rcx, %rax
-	cmpq	%rsi, %rax
-	cmovnb	%rsi, %rax
-	ret
-
-L(page_cross):
-	mov	%rdi, %rax
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-# ifdef USE_AS_WCSLEN
-	sarl	$2, %ecx
-# endif
-	/* ecx contains number of w[char] to be skipped as a result
-	   of address alignment.  */
-	andq	$-VEC_SIZE, %rax
-	VPCMPEQ	(%rax), %VMM(0), %k0
-	KMOV	%k0, %VRDX
-	/* Ignore number of character for alignment adjustment.  */
-	shr	%cl, %VRDX
-	jnz	L(page_cross_end)
-	movl    $CHAR_PER_VEC, %eax
-	sub     %ecx, %eax
-	cmp	%rax, %rsi
-	ja	L(align_more)
-
-L(page_cross_end):
-	bsf	%VRDX, %VRAX
-	cmpq	%rsi, %rax
-	cmovnb	%esi, %eax
-	ret
-
-END (STRNLEN)
-#endif
+#include "strnlen-evex-base.S"
\ No newline at end of file
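
Since the point of the unification is that the 256-bit and 512-bit builds now share one body, both must keep satisfying the same observable contract, including the zero-length, at-limit, and stops-at-null cases the code above handles explicitly. A tiny usage sketch of those properties (this is not the glibc test suite, just an illustration):

#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[64] = "hello";                   /* rest of the buffer is zero-filled */
  assert (strnlen (buf, sizeof buf) == 5);  /* stops at the first null */
  assert (strnlen (buf, 3) == 3);           /* capped at the limit */
  assert (strnlen (buf, 0) == 0);           /* zero limit returns zero */
  return 0;
}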