diff mbox series

[v1,3/7] x86: Optimize strnlen-evex.S and implement with VMM headers

Message ID 20221018024901.3381469-3-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,1/7] x86: Optimize memchr-evex.S and implement with VMM headers | expand

Commit Message

Noah Goldstein Oct. 18, 2022, 2:48 a.m. UTC
Optimizations are:
1. Use the fact that bsf(0) leaves the destination unchanged to save a
   branch in short string case.
2. Restructure code so that small strings are given the hot path.
        - This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.

The optimizations (especially for point 2) make the strnlen and
strlen code essentially incompatible so split strnlen-evex
to a new file.

Code Size Changes:
strlen-evex.S       :  -23 bytes
strnlen-evex.S      : -167 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value as New Time / Old Time so < 1.0 is
improvement and > 1.0 is regression.

strlen-evex.S       : 0.992 (No real change)
strnlen-evex.S      : 0.947

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
 sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
 3 files changed, 572 insertions(+), 404 deletions(-)

Comments

Noah Goldstein Oct. 18, 2022, 2:51 a.m. UTC | #1
On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that bsf(0) leaves the destination unchanged to save a
>    branch in short string case.
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general makes
>       sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the strnlen and
> strlen code essentially incompatible so split strnlen-evex
> to a new file.
>
> Code Size Changes:
> strlen-evex.S       :  -23 bytes
> strnlen-evex.S      : -167 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time so < 1.0 is
> improvement and > 1.0 is regression.
>
> strlen-evex.S       : 0.992 (No real change)
> strnlen-evex.S      : 0.947
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
>  sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
>  3 files changed, 572 insertions(+), 404 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
> index 2109ec2f7a..487846f098 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
> @@ -26,466 +26,220 @@
>  #  define STRLEN       __strlen_evex
>  # endif
>
> -# define VMOVA         vmovdqa64
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
>
>  # ifdef USE_AS_WCSLEN
> -#  define VPCMP                vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
> -#  define SHIFT_REG ecx
>  #  define CHAR_SIZE    4
> +#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
>  # else
> -#  define VPCMP                vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
> -#  define SHIFT_REG edx
>  #  define CHAR_SIZE    1
> +#  define CHAR_SIZE_SHIFT_REG(reg)
> +
> +#  define REG_WIDTH    VEC_SIZE
>  # endif
>
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -
> -# define VEC_SIZE 32
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRLEN)
> -# ifdef USE_AS_STRNLEN
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(zero)
> -#  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
> -       mov     %RSI_LP, %R8_LP
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
>  # endif
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRLEN, 6)
>         movl    %edi, %eax
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -       /* Clear high bits from edi. Only keeping bits relevant to page
> -          cross check.  */
> +       vpxorq  %XZERO, %XZERO, %XZERO
>         andl    $(PAGE_SIZE - 1), %eax
> -       /* Check if we may cross page boundary with one vector load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
>            null byte.  */
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -# ifdef USE_AS_STRNLEN
> -       /* If length < CHAR_PER_VEC handle special.  */
> -       cmpq    $CHAR_PER_VEC, %rsi
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -       ret
> -# ifdef USE_AS_STRNLEN
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Set bit for max len so that tzcnt will return min of max len
> -          and position of first match.  */
> -       btsq    %rsi, %rax
> -       tzcntl  %eax, %eax
> -       ret
> -# endif
> -
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    CHAR_PER_VEC(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
> -# endif
> +       bsf     %VRAX, %VRAX
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> +       .p2align 4,, 8
>  L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> +       bsf     %VRAX, %VRAX
> +       subl    %ecx, %edi
> +       CHAR_SIZE_SHIFT_REG (edi)
>         leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> -# endif
>         ret
>
> -       .p2align 5
> +
> +
> +       /* No null CHAR in the first VEC.  Save the original pointer in
> +          rcx (used to compute the final length) and align rdi down
> +          to VEC_SIZE before checking the following VECs.  */
> +       .p2align 4,, 10
>  L(aligned_more):
> -       movq    %rdi, %rdx
> -       /* Align data to VEC_SIZE.  */
> -       andq    $-(VEC_SIZE), %rdi
> +       movq    %rdi, %rcx
> +       andq    $(VEC_SIZE * -1), %rdi
>  L(cross_page_continue):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
> -# ifdef USE_AS_STRNLEN
> -       /* + CHAR_SIZE because it simplies the logic in
> -          last_4x_vec_or_less.  */
> -       leaq    (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
> -       subq    %rdx, %rcx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMZERO, %k0
> -# ifdef USE_AS_STRNLEN
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rcx, %rsi
> -       jb      L(last_4x_vec_or_less)
> -# endif
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> +          since data is only aligned to VEC_SIZE.  */
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       test    %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x4)
>
> -       addq    $VEC_SIZE, %rdi
> -# ifdef USE_AS_STRNLEN
> -       /* Check if at last VEC_SIZE * 4 length.  */
> -       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> -       jbe     L(last_4x_vec_or_less_load)
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE * 4 - 1), %ecx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -       /* Readjust length.  */
> -       addq    %rcx, %rsi
> -# endif
> -       /* Align data to VEC_SIZE * 4.  */
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +# if CHAR_PER_VEC == 64
> +       /* No partial register stalls on processors that we use evex512
> +          on and this saves code size.  */
> +       xorb    %dil, %dil
> +# else
>         andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +
>
>         /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Load first VEC regardless.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -# ifdef USE_AS_STRNLEN
> -       /* Break if at end of length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rsi
> -       jb      L(last_4x_vec_or_less_cmpeq)
> -# endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there where no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> -       VPMINU  (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
> -       VPMINU  (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
>
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       VPCMP   $0, %YMM4, %YMMZERO, %k1
>         subq    $-(VEC_SIZE * 4), %rdi
> -       kortestd        %k0, %k1
> +       KORTEST %k0, %k2
>         jz      L(loop_4x_vec)
>
> -       /* Check if end was in first half.  */
> -       kmovd   %k0, %eax
> -       subq    %rdx, %rdi
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rdi
> -# endif
> -       testl   %eax, %eax
> -       jz      L(second_vec_return)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
>
> -       VPCMP   $0, %YMM1, %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       /* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
> -# ifdef USE_AS_WCSLEN
> -       sall    $CHAR_PER_VEC, %eax
> -       orl     %edx, %eax
> -       tzcntl  %eax, %eax
> -# else
> -       salq    $CHAR_PER_VEC, %rax
> -       orq     %rdx, %rax
> -       tzcntq  %rax, %rax
> -# endif
> -       addq    %rdi, %rax
> -       ret
> -
> -
> -# ifdef USE_AS_STRNLEN
> -
> -L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, %YMM1, %YMMZERO, %k0
> -       addq    $(VEC_SIZE * 3), %rdi
> -L(last_4x_vec_or_less):
> -       kmovd   %k0, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> -       testl   $(CHAR_PER_VEC * 2), %esi
> -       jnz     L(last_4x_vec)
> -
> -       /* length may have been negative or positive by an offset of
> -          CHAR_PER_VEC * 4 depending on where this was called from. This
> -          fixes that.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1_check)
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
>
> -       /* Check the end of data.  */
> -       subl    $CHAR_PER_VEC, %esi
> -       jb      L(max)
> +       VPTESTN %VMM(3), %VMM(3), %k0
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -L(max):
> -       movq    %r8, %rax
> -       ret
> -# endif
> -
> -       /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
> -          in the 4x VEC loop can use 2 byte encoding.  */
> -       .p2align 4
> -L(second_vec_return):
> -       VPCMP   $0, %YMM3, %YMMZERO, %k0
> -       /* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
> -# ifdef USE_AS_WCSLEN
> -       kunpckbw        %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k2, %VRAX
>  # else
> -       kunpckdq        %k0, %k1, %k0
> -       kmovq   %k0, %rax
> -       tzcntq  %rax, %rax
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rdx, %rax
>  # endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
>
> -
> -# ifdef USE_AS_STRNLEN
> -L(last_vec_x1_check):
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       .p2align 4,, 2
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsfq    %rax, %rax
> +       subq    %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_4x_vec):
> -       /* Test first 2x VEC normally.  */
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1)
> -
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
> -
> -       /* Normalize length.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3)
> -
> -       /* Check the end of data.  */
> -       subl    $(CHAR_PER_VEC * 3), %esi
> -       jb      L(max)
> -
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
> +       .p2align 4,, 8
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x1):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
>         leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       subl    $(CHAR_PER_VEC * 2), %esi
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
> -       ret
> -L(max_end):
> -       movq    %r8, %rax
> +       .p2align 4,, 10
> +       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
> +        */
> +L(TAIL_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       sub     %VRCX, %VRDI
> +       CHAR_SIZE_SHIFT_REG (VRDI)
> +       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
>         ret
> -# endif
>
> -       /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
> -       movq    %rdi, %rdx
> +       movq    %rdi, %rcx
>         /* Align data to VEC_SIZE.  */
>         andq    $-VEC_SIZE, %rdi
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       /* Remove the leading bytes.  */
> +
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRAX
>  # ifdef USE_AS_WCSLEN
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       movl    %edx, %ecx
> -       shrl    $2, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -# endif
> -       /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
> -       sarxl   %SHIFT_REG, %eax, %eax
> +       movl    %ecx, %edx
> +       shrl    $2, %edx
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +       shrx    %edx, %eax, %eax
>         testl   %eax, %eax
> -# ifndef USE_AS_STRNLEN
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> -       ret
>  # else
> -       jnz     L(cross_page_less_vec)
> -#  ifndef USE_AS_WCSLEN
> -       movl    %edx, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -#  endif
> -       movl    $CHAR_PER_VEC, %eax
> -       subl    %ecx, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       ja      L(cross_page_continue)
> -       movl    %esi, %eax
> -       ret
> -L(cross_page_less_vec):
> -       tzcntl  %eax, %eax
> -       /* Select min of length and position of first null.  */
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -       ret
> +       shr     %cl, %VRAX
>  # endif
> +       jz      L(cross_page_continue)
> +       bsf     %VRAX, %VRAX
> +       ret
>
>  END (STRLEN)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
> index 64a9fc2606..443a32a749 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
> @@ -1,8 +1,423 @@
> -#ifndef STRNLEN
> -# define STRNLEN __strnlen_evex
> -#endif
> +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNLEN
> +#  define STRNLEN      __strnlen_evex
> +# endif
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +
> +#  define REG_WIDTH    VEC_SIZE
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 32
> +#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
> +# else
> +#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
> +# endif
> +
> +
> +
> +# if CHAR_PER_VEC == 64
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +# else
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif
> +
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRNLEN, 6)
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +# endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %XZERO, %XZERO, %XZERO
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> +          null byte.  */
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +       movq    %rsi, %rax
> +
> +       /* If src (rcx) is zero, bsf does not change the result.  NB:
> +          Must use 64-bit bsf here so that upper bits of len are not
> +          cleared.  */
> +       bsfq    %rcx, %rax
> +       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> +          CHAR) and rsi must be > CHAR_PER_VEC.  */
> +       cmpq    $CHAR_PER_VEC, %rax
> +       ja      L(more_1x_vec)
> +       /* Check if first match in bounds.  */
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +
> +
> +# if CHAR_PER_VEC != 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       /* Aligned more for strnlen compares remaining length vs 2 *
> +          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> +          going to the loop.  */
> +       .p2align 4,, 10
> +L(more_1x_vec):
> +L(cross_page_continue):
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       movq    %rdi, %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> +# else
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +# endif
> +
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> +
> +       cmpq    $(CHAR_PER_VEC * 2), %rax
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec_or_less):
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       /* Check the end of data.  */
> +       SUB_SHORT (CHAR_PER_VEC, rax)
> +       jbe     L(max_0)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jz      L(max_0)
> +       /* Best place for LAST_VEC_CHECK if ZMM.  */
> +       .p2align 4,, 8
> +L(last_vec_check):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %edx
> +       lea     (%rsi, %rdx), %eax
> +       cmovae  %esi, %eax
> +       ret
> +
> +# if CHAR_PER_VEC == 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       .p2align 4,, 8
> +L(last_4x_vec_or_less):
> +       addl    $(CHAR_PER_VEC * -4), %eax
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2), %eax
> +       jbe     L(last_2x_vec_or_less)
> +
> +       .p2align 4,, 6
> +L(more_2x_vec):
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
>
> -#define USE_AS_STRNLEN 1
> -#define STRLEN STRNLEN
> +       KMOV    %k0, %VRDX
>
> -#include "strlen-evex.S"
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x2)
> +
> +       cmpq    $(CHAR_PER_VEC * 4), %rax
> +       ja      L(more_4x_vec)
> +
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       addl    $(CHAR_PER_VEC * -2), %eax
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       subl    $(CHAR_PER_VEC), %eax
> +       jbe     L(max_1)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +L(max_1):
> +       movl    %esi, %eax
> +       ret
> +
> +       .p2align 4,, 3
> +L(first_vec_x2):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> +       ret
> +
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> +       ret
> +
> +       .p2align 4,, 5
> +L(more_4x_vec):
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x4)
> +
> +       /* Check if at last VEC_SIZE * 4 length before aligning for the
> +          loop.  */
> +       cmpq    $(CHAR_PER_VEC * 8), %rax
> +       jbe     L(last_4x_vec_or_less)
> +
> +
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> +# else
> +       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> +# endif
> +
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +       /* Align data to VEC_SIZE * 4.  */
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# ifdef USE_AS_WCSLEN
> +       subq    %rdi, %rdx
> +       sarq    $2, %rdx
> +       addq    %rdx, %rax
> +# else
> +       subq    %rdi, %rax
> +# endif
> +       /* Compare 4 * VEC at a time forward.  */
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       /* Break if at end of length.  */
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jbe     L(loop_len_end)
> +
> +
> +       KORTEST %k0, %k2
> +       jz      L(loop_4x_vec)
> +
> +
> +L(loop_last_4x_vec):
> +       movq    %rsi, %rcx
> +       subq    %rax, %rsi
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x0)
> +
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 and CHAR_PER_VEC <= 32
> +          for returning last 2x VEC. For CHAR_PER_VEC == 64 we test each
> +          VEC individually, for CHAR_PER_VEC <= 32 we combine them in a
> +          single 64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k2, %VRDX
> +# else
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rax, %rdx
> +# endif
> +
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       bsfq    %rdx, %rdx
> +       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +
> +       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
> +       .p2align 4,, 4
> +L(loop_len_end):
> +       KORTEST %k0, %k2
> +       jnz     L(loop_last_4x_vec)
> +       movq    %rsi, %rax
> +       ret
> +
> +
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
> +          64, we need a separate return label for it.  */
> +       .p2align 4,, 8
> +L(last_vec_x2):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +# endif
> +
> +
> +       .p2align 4,, 10
> +L(last_vec_x1):
> +       addq    $CHAR_PER_VEC, %rsi
> +L(last_vec_x0):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(cross_page_boundary):
> +       /* Align data to VEC_SIZE.  */
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +# endif
> +       shrx    %VRAX, %VRCX, %VRCX
> +
> +       negl    %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +       movq    %rsi, %rdx
> +       bsf     %VRCX, %VRDX
> +       cmpq    %rax, %rdx
> +       ja      L(cross_page_continue)
> +       movl    %edx, %eax
> +       cmpq    %rdx, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +END (STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> index e2aad94c1e..57a7e93fbf 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> @@ -2,8 +2,7 @@
>  # define WCSNLEN       __wcsnlen_evex
>  #endif
>
> -#define STRLEN WCSNLEN
> +#define STRNLEN        WCSNLEN
>  #define USE_AS_WCSLEN 1
> -#define USE_AS_STRNLEN 1
>
> -#include "strlen-evex.S"
> +#include "strnlen-evex.S"
> --
> 2.34.1
>
Results For: strlen
alignment,length ,__strlen_evex ,__strlen_evex_orig 
0        ,0      ,2.789         ,2.836              ,0.983 
0        ,1      ,2.576         ,2.59               ,0.995 
0        ,1024   ,18.366        ,18.987             ,0.967 
0        ,1152   ,19.69         ,20.571             ,0.957 
0        ,128    ,5.532         ,5.481              ,1.009 
0        ,1280   ,21.278        ,22.211             ,0.958 
0        ,1408   ,22.981        ,23.668             ,0.971 
0        ,1536   ,25.244        ,24.822             ,1.017 
0        ,16     ,2.832         ,2.832              ,1.0   
0        ,160    ,8.36          ,8.71               ,0.96  
0        ,1664   ,26.608        ,26.666             ,0.998 
0        ,1792   ,28.21         ,28.953             ,0.974 
0        ,192    ,9.399         ,8.475              ,1.109 
0        ,1920   ,29.609        ,30.389             ,0.974 
0        ,2      ,3.652         ,3.779              ,0.966 
0        ,2048   ,31.087        ,32.884             ,0.945 
0        ,224    ,9.305         ,8.356              ,1.114 
0        ,2304   ,34.284        ,35.183             ,0.974 
0        ,256    ,9.083         ,10.019             ,0.907 
0        ,2560   ,36.909        ,40.442             ,0.913 
0        ,2816   ,43.14         ,48.723             ,0.885 
0        ,288    ,9.432         ,9.851              ,0.957 
0        ,3      ,2.636         ,2.608              ,1.011 
0        ,3072   ,58.749        ,66.729             ,0.88  
0        ,32     ,4.239         ,4.272              ,0.992 
0        ,320    ,10.685        ,9.969              ,1.072 
0        ,3328   ,69.222        ,68.331             ,1.013 
0        ,352    ,10.704        ,9.7                ,1.104 
0        ,3584   ,72.488        ,72.329             ,1.002 
0        ,384    ,10.635        ,11.528             ,0.923 
0        ,3840   ,74.933        ,76.039             ,0.985 
0        ,4      ,2.777         ,2.743              ,1.013 
0        ,4096   ,79.241        ,77.521             ,1.022 
0        ,416    ,11.036        ,11.535             ,0.957 
0        ,448    ,12.466        ,11.544             ,1.08  
0        ,4608   ,84.571        ,84.503             ,1.001 
0        ,480    ,12.479        ,11.472             ,1.088 
0        ,5      ,2.923         ,2.784              ,1.05  
0        ,512    ,12.12         ,12.888             ,0.94  
0        ,5120   ,91.334        ,91.435             ,0.999 
0        ,5632   ,98.695        ,95.914             ,1.029 
0        ,576    ,13.732        ,12.493             ,1.099 
0        ,6      ,2.928         ,2.75               ,1.064 
0        ,6144   ,104.673       ,102.746            ,1.019 
0        ,64     ,4.372         ,4.281              ,1.021 
0        ,640    ,13.884        ,14.217             ,0.977 
0        ,6656   ,112.122       ,110.392            ,1.016 
0        ,7      ,2.798         ,2.763              ,1.012 
0        ,704    ,15.31         ,14.697             ,1.042 
0        ,7168   ,117.652       ,114.757            ,1.025 
0        ,768    ,15.406        ,16.286             ,0.946 
0        ,7680   ,122.809       ,121.845            ,1.008 
0        ,8      ,2.83          ,2.818              ,1.004 
0        ,832    ,17.179        ,16.597             ,1.035 
0        ,896    ,16.906        ,17.978             ,0.94  
0        ,96     ,4.933         ,4.884              ,1.01  
0        ,960    ,18.548        ,18.041             ,1.028 
1        ,1      ,2.594         ,2.619              ,0.991 
10       ,1024   ,18.161        ,19.003             ,0.956 
10       ,682    ,14.286        ,14.158             ,1.009 
11       ,1365   ,23.596        ,21.917             ,1.077 
11       ,2048   ,31.044        ,32.299             ,0.961 
12       ,2730   ,50.067        ,52.292             ,0.957 
12       ,4096   ,79.161        ,78.804             ,1.005 
2        ,2      ,3.055         ,3.22               ,0.949 
2        ,4      ,2.818         ,2.836              ,0.994 
3        ,3      ,2.699         ,2.896              ,0.932 
3        ,5      ,2.843         ,2.852              ,0.997 
3        ,8      ,2.837         ,2.839              ,0.999 
4        ,10     ,2.84          ,2.825              ,1.005 
4        ,16     ,2.811         ,2.826              ,0.994 
4        ,4      ,2.715         ,2.714              ,1.0   
5        ,21     ,2.782         ,2.824              ,0.985 
5        ,32     ,4.189         ,4.222              ,0.992 
5        ,5      ,2.721         ,2.701              ,1.007 
6        ,42     ,4.295         ,4.211              ,1.02  
6        ,6      ,2.775         ,2.81               ,0.988 
6        ,64     ,4.224         ,4.27               ,0.989 
7        ,1024   ,18.286        ,18.987             ,0.963 
7        ,128    ,5.4           ,5.343              ,1.011 
7        ,16     ,2.846         ,2.836              ,1.003 
7        ,2048   ,31.003        ,32.319             ,0.959 
7        ,256    ,9.04          ,9.946              ,0.909 
7        ,32     ,4.219         ,4.218              ,1.0   
7        ,4      ,2.909         ,2.906              ,1.001 
7        ,4096   ,79.073        ,78.896             ,1.002 
7        ,512    ,12.178        ,12.742             ,0.956 
7        ,64     ,4.368         ,4.519              ,0.967 
7        ,7      ,2.762         ,2.771              ,0.997 
7        ,8      ,2.867         ,2.839              ,1.01  
7        ,85     ,4.187         ,4.336              ,0.966 
8        ,170    ,7.993         ,8.372              ,0.955 
8        ,256    ,9.016         ,9.91               ,0.91  
9        ,341    ,10.593        ,9.577              ,1.106 
9        ,512    ,11.939        ,12.694             ,0.941 
0.9925909850217739
Results For: strnlen
align,len  ,max_char ,maxlen ,__strnlen_evex ,__strnlen_evex_orig 
0    ,1    ,127      ,0      ,8.826          ,10.545              ,0.837 
0    ,1    ,127      ,1      ,8.36           ,9.794               ,0.854 
0    ,1    ,127      ,128    ,8.707          ,8.733               ,0.997 
0    ,1    ,127      ,2      ,8.43           ,9.042               ,0.932 
0    ,1    ,127      ,5000   ,8.226          ,8.442               ,0.974 
0    ,1024 ,127      ,1024   ,50.898         ,54.809              ,0.929 
0    ,1024 ,127      ,1056   ,61.814         ,56.289              ,1.098 
0    ,1024 ,127      ,1088   ,61.941         ,57.059              ,1.086 
0    ,1024 ,127      ,1120   ,61.708         ,57.166              ,1.079 
0    ,1024 ,127      ,1152   ,61.88          ,57.664              ,1.073 
0    ,1024 ,127      ,1184   ,62.084         ,60.571              ,1.025 
0    ,1024 ,127      ,1216   ,61.799         ,60.38               ,1.023 
0    ,1024 ,127      ,1248   ,61.836         ,60.313              ,1.025 
0    ,1024 ,127      ,1280   ,61.829         ,60.038              ,1.03  
0    ,1024 ,127      ,1312   ,61.932         ,60.317              ,1.027 
0    ,1024 ,127      ,1344   ,61.726         ,60.014              ,1.029 
0    ,1024 ,127      ,1376   ,62.018         ,60.242              ,1.029 
0    ,1024 ,127      ,1408   ,61.944         ,60.107              ,1.031 
0    ,1024 ,127      ,1440   ,61.799         ,59.875              ,1.032 
0    ,1024 ,127      ,1472   ,61.891         ,60.589              ,1.021 
0    ,1024 ,127      ,1504   ,61.95          ,59.84               ,1.035 
0    ,1024 ,127      ,1536   ,61.757         ,59.769              ,1.033 
0    ,1024 ,127      ,1568   ,61.685         ,60.345              ,1.022 
0    ,1024 ,127      ,1600   ,61.986         ,60.672              ,1.022 
0    ,1024 ,127      ,1632   ,61.845         ,60.189              ,1.028 
0    ,1024 ,127      ,1664   ,61.971         ,61.093              ,1.014 
0    ,1024 ,127      ,1696   ,61.855         ,60.162              ,1.028 
0    ,1024 ,127      ,1728   ,63.386         ,59.919              ,1.058 
0    ,1024 ,127      ,320    ,26.779         ,30.15               ,0.888 
0    ,1024 ,127      ,352    ,26.779         ,30.898              ,0.867 
0    ,1024 ,127      ,384    ,26.768         ,32.851              ,0.815 
0    ,1024 ,127      ,416    ,31.668         ,31.878              ,0.993 
0    ,1024 ,127      ,448    ,31.654         ,33.63               ,0.941 
0    ,1024 ,127      ,480    ,31.685         ,34.387              ,0.921 
0    ,1024 ,127      ,5000   ,61.853         ,62.0                ,0.998 
0    ,1024 ,127      ,512    ,31.67          ,37.012              ,0.856 
0    ,1024 ,127      ,544    ,36.553         ,37.076              ,0.986 
0    ,1024 ,127      ,576    ,36.533         ,38.968              ,0.938 
0    ,1024 ,127      ,608    ,36.527         ,40.962              ,0.892 
0    ,1024 ,127      ,640    ,36.512         ,41.935              ,0.871 
0    ,1024 ,127      ,672    ,41.601         ,40.159              ,1.036 
0    ,1024 ,127      ,704    ,43.111         ,43.128              ,1.0   
0    ,1024 ,127      ,736    ,41.645         ,44.285              ,0.94  
0    ,1024 ,127      ,768    ,41.631         ,46.597              ,0.893 
0    ,1024 ,127      ,800    ,46.671         ,46.504              ,1.004 
0    ,1024 ,127      ,832    ,46.815         ,47.772              ,0.98  
0    ,1024 ,127      ,864    ,46.688         ,51.689              ,0.903 
0    ,1024 ,127      ,896    ,46.743         ,52.56               ,0.889 
0    ,1024 ,127      ,928    ,51.212         ,51.64               ,0.992 
0    ,1024 ,127      ,960    ,51.243         ,53.334              ,0.961 
0    ,1024 ,127      ,992    ,51.256         ,54.768              ,0.936 
0    ,1056 ,127      ,1024   ,51.215         ,55.52               ,0.922 
0    ,1056 ,127      ,512    ,31.646         ,36.902              ,0.858 
0    ,1088 ,127      ,1024   ,51.259         ,56.534              ,0.907 
0    ,1088 ,127      ,512    ,31.647         ,36.903              ,0.858 
0    ,112  ,127      ,16     ,8.512          ,9.287               ,0.917 
0    ,1120 ,127      ,1024   ,51.303         ,55.574              ,0.923 
0    ,1120 ,127      ,512    ,31.644         ,37.558              ,0.843 
0    ,1152 ,127      ,1024   ,51.252         ,56.372              ,0.909 
0    ,1152 ,127      ,512    ,31.647         ,37.888              ,0.835 
0    ,1184 ,127      ,1024   ,51.223         ,56.414              ,0.908 
0    ,1184 ,127      ,512    ,31.635         ,36.859              ,0.858 
0    ,1216 ,127      ,1024   ,51.243         ,55.82               ,0.918 
0    ,1216 ,127      ,512    ,31.66          ,36.881              ,0.858 
0    ,1248 ,127      ,1024   ,51.211         ,55.607              ,0.921 
0    ,128  ,127      ,1      ,8.815          ,8.894               ,0.991 
0    ,128  ,127      ,128    ,15.165         ,17.562              ,0.863 
0    ,128  ,127      ,160    ,18.865         ,20.212              ,0.933 
0    ,128  ,127      ,192    ,17.618         ,16.757              ,1.051 
0    ,128  ,127      ,224    ,17.609         ,16.766              ,1.05  
0    ,128  ,127      ,256    ,17.597         ,16.589              ,1.061 
0    ,128  ,127      ,288    ,17.592         ,17.272              ,1.019 
0    ,128  ,127      ,32     ,8.262          ,8.96                ,0.922 
0    ,128  ,127      ,320    ,17.6           ,16.518              ,1.065 
0    ,128  ,127      ,352    ,17.601         ,16.965              ,1.037 
0    ,128  ,127      ,384    ,17.595         ,16.917              ,1.04  
0    ,128  ,127      ,416    ,17.608         ,16.805              ,1.048 
0    ,128  ,127      ,448    ,17.599         ,17.616              ,0.999 
0    ,128  ,127      ,480    ,17.604         ,16.925              ,1.04  
0    ,128  ,127      ,5000   ,17.6           ,17.169              ,1.025 
0    ,128  ,127      ,512    ,17.617         ,16.877              ,1.044 
0    ,128  ,127      ,544    ,17.618         ,16.679              ,1.056 
0    ,128  ,127      ,576    ,17.588         ,17.283              ,1.018 
0    ,128  ,127      ,608    ,17.611         ,17.113              ,1.029 
0    ,128  ,127      ,64     ,11.588         ,16.35               ,0.709 
0    ,128  ,127      ,640    ,17.596         ,16.752              ,1.05  
0    ,128  ,127      ,672    ,17.606         ,16.778              ,1.049 
0    ,128  ,127      ,704    ,17.591         ,17.232              ,1.021 
0    ,128  ,127      ,736    ,17.605         ,16.987              ,1.036 
0    ,128  ,127      ,768    ,17.619         ,17.879              ,0.985 
0    ,128  ,127      ,800    ,17.605         ,17.371              ,1.013 
0    ,128  ,127      ,832    ,17.603         ,16.967              ,1.037 
0    ,128  ,127      ,96     ,12.339         ,16.454              ,0.75  
0    ,1280 ,127      ,1024   ,51.193         ,55.361              ,0.925 
0    ,1312 ,127      ,1024   ,51.2           ,56.589              ,0.905 
0    ,1344 ,127      ,1024   ,51.203         ,55.915              ,0.916 
0    ,1344 ,127      ,2048   ,75.041         ,70.123              ,1.07  
0    ,1376 ,127      ,1024   ,51.251         ,55.31               ,0.927 
0    ,1376 ,127      ,2048   ,75.027         ,70.119              ,1.07  
0    ,1408 ,127      ,1024   ,51.199         ,56.591              ,0.905 
0    ,1408 ,127      ,2048   ,75.92          ,74.458              ,1.02  
0    ,144  ,127      ,16     ,8.276          ,9.446               ,0.876 
0    ,1440 ,127      ,1024   ,51.278         ,55.935              ,0.917 
0    ,1440 ,127      ,2048   ,76.43          ,72.711              ,1.051 
0    ,1472 ,127      ,1024   ,51.257         ,56.579              ,0.906 
0    ,1472 ,127      ,2048   ,79.523         ,74.993              ,1.06  
0    ,1504 ,127      ,1024   ,51.191         ,56.314              ,0.909 
0    ,1504 ,127      ,2048   ,79.489         ,74.554              ,1.066 
0    ,1536 ,127      ,1024   ,51.204         ,55.617              ,0.921 
0    ,1536 ,127      ,2048   ,80.762         ,80.577              ,1.002 
0    ,1568 ,127      ,1024   ,51.231         ,55.206              ,0.928 
0    ,1568 ,127      ,2048   ,81.672         ,77.45               ,1.055 
0    ,16   ,127      ,112    ,8.028          ,7.947               ,1.01  
0    ,16   ,127      ,144    ,8.253          ,7.179               ,1.15  
0    ,16   ,127      ,16     ,7.711          ,8.782               ,0.878 
0    ,16   ,127      ,176    ,7.765          ,7.904               ,0.982 
0    ,16   ,127      ,208    ,7.985          ,7.606               ,1.05  
0    ,16   ,127      ,240    ,7.872          ,8.401               ,0.937 
0    ,16   ,127      ,272    ,7.991          ,7.467               ,1.07  
0    ,16   ,127      ,304    ,7.872          ,7.737               ,1.018 
0    ,16   ,127      ,336    ,7.981          ,7.474               ,1.068 
0    ,16   ,127      ,368    ,7.985          ,8.093               ,0.987 
0    ,16   ,127      ,400    ,8.134          ,7.181               ,1.133 
0    ,16   ,127      ,432    ,7.913          ,8.09                ,0.978 
0    ,16   ,127      ,464    ,7.873          ,8.062               ,0.976 
0    ,16   ,127      ,48     ,8.523          ,7.473               ,1.14  
0    ,16   ,127      ,496    ,7.872          ,7.469               ,1.054 
0    ,16   ,127      ,5000   ,8.014          ,7.552               ,1.061 
0    ,16   ,127      ,528    ,8.103          ,7.766               ,1.043 
0    ,16   ,127      ,560    ,7.77           ,7.495               ,1.037 
0    ,16   ,127      ,592    ,7.872          ,7.779               ,1.012 
0    ,16   ,127      ,624    ,7.877          ,7.929               ,0.993 
0    ,16   ,127      ,656    ,8.207          ,8.078               ,1.016 
0    ,16   ,127      ,688    ,8.081          ,8.243               ,0.98  
0    ,16   ,127      ,720    ,7.895          ,7.96                ,0.992 
0    ,16   ,127      ,80     ,7.766          ,8.232               ,0.943 
0    ,160  ,127      ,128    ,15.154         ,18.801              ,0.806 
0    ,160  ,127      ,256    ,20.798         ,22.397              ,0.929 
0    ,160  ,127      ,32     ,8.391          ,9.465               ,0.887 
0    ,160  ,127      ,512    ,28.453         ,27.335              ,1.041 
0    ,160  ,127      ,64     ,11.772         ,16.048              ,0.734 
0    ,1600 ,127      ,1024   ,51.248         ,56.536              ,0.906 
0    ,1600 ,127      ,2048   ,83.783         ,79.095              ,1.059 
0    ,1632 ,127      ,1024   ,51.209         ,55.354              ,0.925 
0    ,1632 ,127      ,2048   ,83.795         ,80.783              ,1.037 
0    ,1664 ,127      ,1024   ,51.231         ,55.463              ,0.924 
0    ,1664 ,127      ,2048   ,84.843         ,81.011              ,1.047 
0    ,1696 ,127      ,1024   ,51.224         ,55.806              ,0.918 
0    ,1696 ,127      ,2048   ,85.355         ,81.067              ,1.053 
0    ,1728 ,127      ,1024   ,51.24          ,55.575              ,0.922 
0    ,1728 ,127      ,2048   ,88.35          ,85.182              ,1.037 
0    ,176  ,127      ,16     ,7.848          ,9.112               ,0.861 
0    ,1760 ,127      ,2048   ,88.324         ,86.607              ,1.02  
0    ,1792 ,127      ,2048   ,89.051         ,89.539              ,0.995 
0    ,1824 ,127      ,2048   ,89.869         ,89.569              ,1.003 
0    ,1856 ,127      ,2048   ,92.812         ,92.592              ,1.002 
0    ,1888 ,127      ,2048   ,92.888         ,89.784              ,1.035 
0    ,192  ,127      ,128    ,16.134         ,19.141              ,0.843 
0    ,192  ,127      ,256    ,22.552         ,23.728              ,0.95  
0    ,192  ,127      ,32     ,7.771          ,8.878               ,0.875 
0    ,192  ,127      ,512    ,30.556         ,27.211              ,1.123 
0    ,192  ,127      ,64     ,11.901         ,15.859              ,0.75  
0    ,1920 ,127      ,2048   ,93.42          ,87.672              ,1.066 
0    ,1952 ,127      ,2048   ,94.412         ,89.887              ,1.05  
0    ,1984 ,127      ,2048   ,97.4           ,95.328              ,1.022 
0    ,2    ,127      ,1      ,8.372          ,8.943               ,0.936 
0    ,2    ,127      ,2      ,8.219          ,9.107               ,0.902 
0    ,2    ,127      ,3      ,8.136          ,9.115               ,0.893 
0    ,2    ,127      ,5000   ,8.244          ,7.468               ,1.104 
0    ,2016 ,127      ,2048   ,97.397         ,93.516              ,1.042 
0    ,2048 ,127      ,1344   ,65.155         ,65.144              ,1.0   
0    ,2048 ,127      ,1376   ,65.218         ,68.192              ,0.956 
0    ,2048 ,127      ,1408   ,65.129         ,69.788              ,0.933 
0    ,2048 ,127      ,1440   ,69.729         ,69.167              ,1.008 
0    ,2048 ,127      ,1472   ,69.858         ,70.173              ,0.996 
0    ,2048 ,127      ,1504   ,69.811         ,76.589              ,0.912 
0    ,2048 ,127      ,1536   ,69.755         ,71.866              ,0.971 
0    ,2048 ,127      ,1568   ,74.011         ,72.649              ,1.019 
0    ,2048 ,127      ,1600   ,74.101         ,73.454              ,1.009 
0    ,2048 ,127      ,1632   ,74.022         ,78.453              ,0.944 
0    ,2048 ,127      ,1664   ,74.022         ,76.724              ,0.965 
0    ,2048 ,127      ,1696   ,78.328         ,77.968              ,1.005 
0    ,2048 ,127      ,1728   ,78.165         ,79.1                ,0.988 
0    ,2048 ,127      ,1760   ,78.292         ,86.051              ,0.91  
0    ,2048 ,127      ,1792   ,78.238         ,82.325              ,0.95  
0    ,2048 ,127      ,1824   ,82.681         ,91.502              ,0.904 
0    ,2048 ,127      ,1856   ,82.708         ,90.495              ,0.914 
0    ,2048 ,127      ,1888   ,82.688         ,90.966              ,0.909 
0    ,2048 ,127      ,1920   ,82.953         ,88.146              ,0.941 
0    ,2048 ,127      ,1952   ,88.907         ,86.354              ,1.03  
0    ,2048 ,127      ,1984   ,87.401         ,89.249              ,0.979 
0    ,2048 ,127      ,2016   ,87.451         ,93.03               ,0.94  
0    ,2048 ,127      ,2048   ,87.085         ,87.77               ,0.992 
0    ,2048 ,127      ,2080   ,97.034         ,91.859              ,1.056 
0    ,2048 ,127      ,2112   ,97.241         ,89.463              ,1.087 
0    ,2048 ,127      ,2144   ,97.439         ,91.745              ,1.062 
0    ,2048 ,127      ,2176   ,97.365         ,91.434              ,1.065 
0    ,2048 ,127      ,2208   ,97.29          ,94.349              ,1.031 
0    ,2048 ,127      ,2240   ,97.514         ,94.828              ,1.028 
0    ,2048 ,127      ,2272   ,97.354         ,96.468              ,1.009 
0    ,2048 ,127      ,2304   ,97.463         ,95.07               ,1.025 
0    ,2048 ,127      ,2336   ,97.521         ,93.862              ,1.039 
0    ,2048 ,127      ,2368   ,97.458         ,91.991              ,1.059 
0    ,2048 ,127      ,2400   ,97.462         ,95.001              ,1.026 
0    ,2048 ,127      ,2432   ,97.431         ,94.729              ,1.029 
0    ,2048 ,127      ,2464   ,98.059         ,96.648              ,1.015 
0    ,2048 ,127      ,2496   ,98.201         ,94.299              ,1.041 
0    ,2048 ,127      ,2528   ,97.463         ,92.872              ,1.049 
0    ,2048 ,127      ,2560   ,97.224         ,92.746              ,1.048 
0    ,2048 ,127      ,2592   ,97.552         ,92.734              ,1.052 
0    ,2048 ,127      ,2624   ,97.225         ,94.323              ,1.031 
0    ,2048 ,127      ,2656   ,97.533         ,92.955              ,1.049 
0    ,2048 ,127      ,2688   ,97.286         ,92.563              ,1.051 
0    ,2048 ,127      ,2720   ,97.663         ,93.009              ,1.05  
0    ,2048 ,127      ,2752   ,97.566         ,92.544              ,1.054 
0    ,208  ,127      ,16     ,8.269          ,9.636               ,0.858 
0    ,2080 ,127      ,2048   ,87.327         ,88.36               ,0.988 
0    ,2112 ,127      ,2048   ,87.295         ,88.916              ,0.982 
0    ,2144 ,127      ,2048   ,87.303         ,88.041              ,0.992 
0    ,2176 ,127      ,2048   ,87.271         ,92.076              ,0.948 
0    ,2208 ,127      ,2048   ,87.277         ,88.826              ,0.983 
0    ,224  ,127      ,128    ,15.744         ,18.486              ,0.852 
0    ,224  ,127      ,256    ,25.117         ,24.473              ,1.026 
0    ,224  ,127      ,32     ,8.188          ,9.108               ,0.899 
0    ,224  ,127      ,512    ,30.598         ,27.231              ,1.124 
0    ,224  ,127      ,64     ,11.588         ,14.368              ,0.807 
0    ,2240 ,127      ,2048   ,87.264         ,92.115              ,0.947 
0    ,2272 ,127      ,2048   ,87.337         ,93.49               ,0.934 
0    ,2304 ,127      ,2048   ,89.4           ,88.821              ,1.007 
0    ,2336 ,127      ,2048   ,87.416         ,91.319              ,0.957 
0    ,2368 ,127      ,2048   ,87.567         ,91.481              ,0.957 
0    ,240  ,127      ,16     ,7.919          ,9.446               ,0.838 
0    ,2400 ,127      ,2048   ,87.283         ,91.766              ,0.951 
0    ,2432 ,127      ,2048   ,87.24          ,88.452              ,0.986 
0    ,2464 ,127      ,2048   ,87.265         ,89.14               ,0.979 
0    ,2496 ,127      ,2048   ,87.269         ,90.857              ,0.961 
0    ,2528 ,127      ,2048   ,87.281         ,88.188              ,0.99  
0    ,256  ,127      ,128    ,15.801         ,18.709              ,0.845 
0    ,256  ,127      ,160    ,16.748         ,19.81               ,0.845 
0    ,256  ,127      ,192    ,20.426         ,22.021              ,0.928 
0    ,256  ,127      ,224    ,21.854         ,25.135              ,0.869 
0    ,256  ,127      ,256    ,24.458         ,23.601              ,1.036 
0    ,256  ,127      ,288    ,27.505         ,26.207              ,1.05  
0    ,256  ,127      ,32     ,8.482          ,8.969               ,0.946 
0    ,256  ,127      ,320    ,32.108         ,29.16               ,1.101 
0    ,256  ,127      ,352    ,32.026         ,27.815              ,1.151 
0    ,256  ,127      ,384    ,32.05          ,27.73               ,1.156 
0    ,256  ,127      ,416    ,31.946         ,31.99               ,0.999 
0    ,256  ,127      ,448    ,32.078         ,32.051              ,1.001 
0    ,256  ,127      ,480    ,32.029         ,31.955              ,1.002 
0    ,256  ,127      ,5000   ,32.099         ,32.119              ,0.999 
0    ,256  ,127      ,512    ,32.106         ,31.981              ,1.004 
0    ,256  ,127      ,544    ,32.112         ,32.085              ,1.001 
0    ,256  ,127      ,576    ,32.102         ,32.016              ,1.003 
0    ,256  ,127      ,608    ,32.129         ,32.028              ,1.003 
0    ,256  ,127      ,64     ,11.543         ,16.009              ,0.721 
0    ,256  ,127      ,640    ,32.065         ,32.097              ,0.999 
0    ,256  ,127      ,672    ,32.034         ,31.884              ,1.005 
0    ,256  ,127      ,704    ,33.044         ,32.017              ,1.032 
0    ,256  ,127      ,736    ,32.079         ,31.959              ,1.004 
0    ,256  ,127      ,768    ,32.121         ,32.047              ,1.002 
0    ,256  ,127      ,800    ,32.118         ,31.976              ,1.004 
0    ,256  ,127      ,832    ,32.062         ,31.96               ,1.003 
0    ,256  ,127      ,864    ,32.031         ,31.882              ,1.005 
0    ,256  ,127      ,896    ,32.091         ,31.986              ,1.003 
0    ,256  ,127      ,928    ,32.001         ,31.985              ,1.001 
0    ,256  ,127      ,96     ,12.448         ,16.698              ,0.745 
0    ,256  ,127      ,960    ,32.025         ,32.087              ,0.998 
0    ,2560 ,127      ,2048   ,87.253         ,88.383              ,0.987 
0    ,2592 ,127      ,2048   ,87.302         ,88.626              ,0.985 
0    ,2624 ,127      ,2048   ,87.315         ,93.108              ,0.938 
0    ,2656 ,127      ,2048   ,88.187         ,88.823              ,0.993 
0    ,2688 ,127      ,2048   ,87.345         ,88.174              ,0.991 
0    ,272  ,127      ,16     ,7.93           ,9.626               ,0.824 
0    ,2720 ,127      ,2048   ,87.285         ,88.878              ,0.982 
0    ,2752 ,127      ,2048   ,87.233         ,88.579              ,0.985 
0    ,288  ,127      ,128    ,15.364         ,18.403              ,0.835 
0    ,288  ,127      ,256    ,24.552         ,24.252              ,1.012 
0    ,288  ,127      ,32     ,8.017          ,9.577               ,0.837 
0    ,288  ,127      ,512    ,33.191         ,32.165              ,1.032 
0    ,288  ,127      ,64     ,11.494         ,15.185              ,0.757 
0    ,3    ,127      ,2      ,8.285          ,8.966               ,0.924 
0    ,3    ,127      ,3      ,8.167          ,8.983               ,0.909 
0    ,3    ,127      ,4      ,8.01           ,9.069               ,0.883 
0    ,3    ,127      ,5000   ,8.128          ,7.766               ,1.047 
0    ,304  ,127      ,16     ,8.096          ,9.454               ,0.856 
0    ,32   ,127      ,128    ,12.311         ,16.153              ,0.762 
0    ,32   ,127      ,160    ,12.336         ,16.172              ,0.763 
0    ,32   ,127      ,192    ,12.305         ,13.279              ,0.927 
0    ,32   ,127      ,224    ,12.308         ,13.091              ,0.94  
0    ,32   ,127      ,256    ,12.632         ,13.381              ,0.944 
0    ,32   ,127      ,288    ,12.294         ,12.47               ,0.986 
0    ,32   ,127      ,32     ,7.66           ,8.781               ,0.872 
0    ,32   ,127      ,320    ,12.333         ,13.122              ,0.94  
0    ,32   ,127      ,352    ,12.339         ,12.464              ,0.99  
0    ,32   ,127      ,384    ,12.304         ,12.46               ,0.987 
0    ,32   ,127      ,416    ,12.336         ,13.574              ,0.909 
0    ,32   ,127      ,448    ,12.354         ,12.306              ,1.004 
0    ,32   ,127      ,480    ,12.304         ,12.304              ,1.0   
0    ,32   ,127      ,5000   ,12.306         ,13.123              ,0.938 
0    ,32   ,127      ,512    ,12.32          ,13.246              ,0.93  
0    ,32   ,127      ,544    ,12.34          ,13.222              ,0.933 
0    ,32   ,127      ,576    ,12.339         ,12.918              ,0.955 
0    ,32   ,127      ,608    ,12.343         ,12.805              ,0.964 
0    ,32   ,127      ,64     ,12.98          ,14.809              ,0.877 
0    ,32   ,127      ,640    ,12.304         ,12.471              ,0.987 
0    ,32   ,127      ,672    ,12.303         ,12.464              ,0.987 
0    ,32   ,127      ,704    ,12.3           ,12.804              ,0.961 
0    ,32   ,127      ,736    ,12.298         ,12.464              ,0.987 
0    ,32   ,127      ,96     ,12.424         ,14.9                ,0.834 
0    ,320  ,127      ,1024   ,35.324         ,31.788              ,1.111 
0    ,320  ,127      ,128    ,15.262         ,18.518              ,0.824 
0    ,320  ,127      ,256    ,24.669         ,25.17               ,0.98  
0    ,320  ,127      ,32     ,7.999          ,9.123               ,0.877 
0    ,320  ,127      ,512    ,35.3           ,31.824              ,1.109 
0    ,320  ,127      ,64     ,11.522         ,15.007              ,0.768 
0    ,336  ,127      ,16     ,7.981          ,8.948               ,0.892 
0    ,3392 ,127      ,4096   ,150.235        ,190.301             ,0.789 
0    ,3424 ,127      ,4096   ,144.605        ,190.131             ,0.761 
0    ,3456 ,127      ,4096   ,142.366        ,193.997             ,0.734 
0    ,3488 ,127      ,4096   ,145.561        ,196.579             ,0.74  
0    ,352  ,127      ,1024   ,35.334         ,31.77               ,1.112 
0    ,352  ,127      ,128    ,16.03          ,18.485              ,0.867 
0    ,352  ,127      ,256    ,24.505         ,24.607              ,0.996 
0    ,352  ,127      ,32     ,8.016          ,9.285               ,0.863 
0    ,352  ,127      ,512    ,35.297         ,31.777              ,1.111 
0    ,352  ,127      ,64     ,11.594         ,16.022              ,0.724 
0    ,3520 ,127      ,4096   ,149.189        ,187.86              ,0.794 
0    ,3552 ,127      ,4096   ,148.896        ,189.592             ,0.785 
0    ,3584 ,127      ,4096   ,146.434        ,195.891             ,0.748 
0    ,3616 ,127      ,4096   ,149.628        ,194.825             ,0.768 
0    ,3648 ,127      ,4096   ,153.47         ,190.168             ,0.807 
0    ,368  ,127      ,16     ,8.17           ,9.113               ,0.897 
0    ,3680 ,127      ,4096   ,155.436        ,191.619             ,0.811 
0    ,3712 ,127      ,4096   ,149.822        ,203.939             ,0.735 
0    ,3744 ,127      ,4096   ,153.881        ,196.519             ,0.783 
0    ,3776 ,127      ,4096   ,158.302        ,200.946             ,0.788 
0    ,3808 ,127      ,4096   ,158.081        ,209.14              ,0.756 
0    ,384  ,127      ,1024   ,37.181         ,36.796              ,1.01  
0    ,384  ,127      ,128    ,16.028         ,18.65               ,0.859 
0    ,384  ,127      ,256    ,24.866         ,24.507              ,1.015 
0    ,384  ,127      ,32     ,8.429          ,8.943               ,0.943 
0    ,384  ,127      ,512    ,37.171         ,32.643              ,1.139 
0    ,384  ,127      ,64     ,11.473         ,15.68               ,0.732 
0    ,3840 ,127      ,4096   ,155.507        ,200.042             ,0.777 
0    ,3872 ,127      ,4096   ,158.122        ,199.468             ,0.793 
0    ,3904 ,127      ,4096   ,163.552        ,199.163             ,0.821 
0    ,3936 ,127      ,4096   ,162.695        ,204.503             ,0.796 
0    ,3968 ,127      ,4096   ,173.435        ,177.618             ,0.976 
0    ,4    ,127      ,3      ,8.129          ,9.283               ,0.876 
0    ,4    ,127      ,4      ,7.918          ,9.049               ,0.875 
0    ,4    ,127      ,5      ,8.122          ,9.107               ,0.892 
0    ,4    ,127      ,5000   ,7.665          ,7.321               ,1.047 
0    ,400  ,127      ,16     ,8.183          ,8.943               ,0.915 
0    ,4000 ,127      ,4096   ,182.372        ,176.806             ,1.031 
0    ,4032 ,127      ,4096   ,173.531        ,176.896             ,0.981 
0    ,4064 ,127      ,4096   ,170.429        ,188.202             ,0.906 
0    ,4096 ,127      ,3392   ,134.112        ,159.888             ,0.839 
0    ,4096 ,127      ,3424   ,134.255        ,171.495             ,0.783 
0    ,4096 ,127      ,3456   ,134.558        ,165.724             ,0.812 
0    ,4096 ,127      ,3488   ,138.429        ,166.295             ,0.832 
0    ,4096 ,127      ,3520   ,138.508        ,163.608             ,0.847 
0    ,4096 ,127      ,3552   ,138.455        ,167.833             ,0.825 
0    ,4096 ,127      ,3584   ,139.393        ,165.671             ,0.841 
0    ,4096 ,127      ,3616   ,142.563        ,170.198             ,0.838 
0    ,4096 ,127      ,3648   ,142.746        ,169.878             ,0.84  
0    ,4096 ,127      ,3680   ,142.798        ,171.673             ,0.832 
0    ,4096 ,127      ,3712   ,142.619        ,173.275             ,0.823 
0    ,4096 ,127      ,3744   ,147.268        ,170.217             ,0.865 
0    ,4096 ,127      ,3776   ,147.036        ,169.047             ,0.87  
0    ,4096 ,127      ,3808   ,146.977        ,172.515             ,0.852 
0    ,4096 ,127      ,3840   ,147.399        ,175.952             ,0.838 
0    ,4096 ,127      ,3872   ,151.254        ,178.702             ,0.846 
0    ,4096 ,127      ,3904   ,151.309        ,177.89              ,0.851 
0    ,4096 ,127      ,3936   ,151.626        ,181.201             ,0.837 
0    ,4096 ,127      ,3968   ,151.281        ,177.809             ,0.851 
0    ,4096 ,127      ,4000   ,155.566        ,176.872             ,0.88  
0    ,4096 ,127      ,4032   ,156.314        ,178.469             ,0.876 
0    ,4096 ,127      ,4064   ,156.323        ,191.263             ,0.817 
0    ,4096 ,127      ,4096   ,155.278        ,175.579             ,0.884 
0    ,4096 ,127      ,4128   ,163.473        ,187.974             ,0.87  
0    ,4096 ,127      ,4160   ,166.296        ,182.482             ,0.911 
0    ,4096 ,127      ,4192   ,162.559        ,178.45              ,0.911 
0    ,4096 ,127      ,4224   ,164.064        ,179.153             ,0.916 
0    ,4096 ,127      ,4256   ,181.209        ,212.238             ,0.854 
0    ,4096 ,127      ,4288   ,167.509        ,206.898             ,0.81  
0    ,4096 ,127      ,4320   ,162.726        ,210.745             ,0.772 
0    ,4096 ,127      ,4352   ,163.294        ,215.134             ,0.759 
0    ,4096 ,127      ,4384   ,163.785        ,208.764             ,0.785 
0    ,4096 ,127      ,4416   ,164.439        ,207.951             ,0.791 
0    ,4096 ,127      ,4448   ,163.662        ,206.41              ,0.793 
0    ,4096 ,127      ,4480   ,164.414        ,205.231             ,0.801 
0    ,4096 ,127      ,4512   ,163.637        ,214.655             ,0.762 
0    ,4096 ,127      ,4544   ,162.945        ,207.81              ,0.784 
0    ,4096 ,127      ,4576   ,162.81         ,212.317             ,0.767 
0    ,4096 ,127      ,4608   ,167.929        ,207.966             ,0.807 
0    ,4096 ,127      ,4640   ,162.01         ,207.893             ,0.779 
0    ,4096 ,127      ,4672   ,172.59         ,209.725             ,0.823 
0    ,4096 ,127      ,4704   ,168.842        ,209.017             ,0.808 
0    ,4096 ,127      ,4736   ,172.708        ,221.116             ,0.781 
0    ,4096 ,127      ,4768   ,163.522        ,209.261             ,0.781 
0    ,4096 ,127      ,4800   ,162.52         ,213.294             ,0.762 
0    ,4128 ,127      ,4096   ,155.478        ,182.694             ,0.851 
0    ,416  ,127      ,1024   ,38.324         ,37.116              ,1.033 
0    ,416  ,127      ,128    ,15.347         ,18.663              ,0.822 
0    ,416  ,127      ,256    ,24.518         ,24.291              ,1.009 
0    ,416  ,127      ,32     ,8.096          ,9.275               ,0.873 
0    ,416  ,127      ,512    ,38.394         ,34.173              ,1.124 
0    ,416  ,127      ,64     ,11.255         ,14.832              ,0.759 
0    ,4160 ,127      ,4096   ,155.74         ,184.944             ,0.842 
0    ,4192 ,127      ,4096   ,155.272        ,183.359             ,0.847 
0    ,4224 ,127      ,4096   ,155.427        ,181.21              ,0.858 
0    ,4256 ,127      ,4096   ,155.675        ,180.996             ,0.86  
0    ,4288 ,127      ,4096   ,156.771        ,179.921             ,0.871 
0    ,432  ,127      ,16     ,8.512          ,8.949               ,0.951 
0    ,4320 ,127      ,4096   ,157.846        ,181.116             ,0.872 
0    ,4352 ,127      ,4096   ,155.56         ,185.393             ,0.839 
0    ,4384 ,127      ,4096   ,155.489        ,186.039             ,0.836 
0    ,4416 ,127      ,4096   ,155.707        ,182.402             ,0.854 
0    ,4448 ,127      ,4096   ,155.77         ,181.283             ,0.859 
0    ,448  ,127      ,1024   ,40.651         ,36.497              ,1.114 
0    ,448  ,127      ,128    ,15.182         ,19.331              ,0.785 
0    ,448  ,127      ,256    ,24.505         ,24.898              ,0.984 
0    ,448  ,127      ,32     ,7.933          ,8.788               ,0.903 
0    ,448  ,127      ,512    ,40.662         ,37.111              ,1.096 
0    ,448  ,127      ,64     ,11.556         ,16.163              ,0.715 
0    ,4480 ,127      ,4096   ,156.429        ,184.441             ,0.848 
0    ,4512 ,127      ,4096   ,155.53         ,180.857             ,0.86  
0    ,4544 ,127      ,4096   ,156.2          ,183.916             ,0.849 
0    ,4576 ,127      ,4096   ,155.654        ,180.911             ,0.86  
0    ,4608 ,127      ,4096   ,155.66         ,185.312             ,0.84  
0    ,464  ,127      ,16     ,8.127          ,9.619               ,0.845 
0    ,4640 ,127      ,4096   ,155.667        ,179.762             ,0.866 
0    ,4672 ,127      ,4096   ,155.61         ,186.585             ,0.834 
0    ,4704 ,127      ,4096   ,155.664        ,189.499             ,0.821 
0    ,4736 ,127      ,4096   ,155.896        ,187.151             ,0.833 
0    ,4768 ,127      ,4096   ,155.663        ,185.39              ,0.84  
0    ,48   ,127      ,16     ,8.181          ,8.943               ,0.915 
0    ,480  ,127      ,1024   ,40.736         ,36.551              ,1.115 
0    ,480  ,127      ,128    ,15.69          ,18.342              ,0.855 
0    ,480  ,127      ,256    ,24.684         ,24.586              ,1.004 
0    ,480  ,127      ,32     ,8.127          ,9.456               ,0.859 
0    ,480  ,127      ,512    ,40.643         ,37.968              ,1.07  
0    ,480  ,127      ,64     ,11.367         ,15.192              ,0.748 
0    ,4800 ,127      ,4096   ,155.66         ,185.849             ,0.838 
0    ,496  ,127      ,16     ,8.395          ,9.28                ,0.905 
0    ,5    ,127      ,4      ,8.201          ,9.108               ,0.9   
0    ,5    ,127      ,5      ,8.085          ,9.107               ,0.888 
0    ,5    ,127      ,5000   ,8.128          ,7.622               ,1.066 
0    ,5    ,127      ,6      ,8.156          ,9.28                ,0.879 
0    ,5000 ,127      ,1      ,8.628          ,8.806               ,0.98  
0    ,5000 ,127      ,1024   ,51.209         ,56.867              ,0.901 
0    ,5000 ,127      ,128    ,17.026         ,18.619              ,0.914 
0    ,5000 ,127      ,16     ,8.186          ,9.38                ,0.873 
0    ,5000 ,127      ,2      ,8.136          ,9.123               ,0.892 
0    ,5000 ,127      ,256    ,24.936         ,24.81               ,1.005 
0    ,5000 ,127      ,3      ,8.277          ,9.624               ,0.86  
0    ,5000 ,127      ,32     ,8.417          ,9.114               ,0.924 
0    ,5000 ,127      ,4      ,7.665          ,8.788               ,0.872 
0    ,5000 ,127      ,5      ,7.872          ,8.943               ,0.88  
0    ,5000 ,127      ,512    ,31.663         ,37.085              ,0.854 
0    ,5000 ,127      ,6      ,8.644          ,9.052               ,0.955 
0    ,5000 ,127      ,64     ,11.542         ,15.94               ,0.724 
0    ,5000 ,127      ,7      ,8.02           ,9.011               ,0.89  
0    ,5000 ,127      ,8      ,8.026          ,8.952               ,0.897 
0    ,512  ,127      ,1024   ,41.887         ,41.549              ,1.008 
0    ,512  ,127      ,1056   ,41.851         ,41.465              ,1.009 
0    ,512  ,127      ,1088   ,41.795         ,42.078              ,0.993 
0    ,512  ,127      ,1120   ,41.903         ,41.43               ,1.011 
0    ,512  ,127      ,1152   ,42.096         ,41.437              ,1.016 
0    ,512  ,127      ,1184   ,41.949         ,41.367              ,1.014 
0    ,512  ,127      ,1216   ,42.025         ,41.343              ,1.016 
0    ,512  ,127      ,128    ,16.134         ,18.676              ,0.864 
0    ,512  ,127      ,160    ,16.73          ,19.325              ,0.866 
0    ,512  ,127      ,192    ,20.227         ,22.514              ,0.898 
0    ,512  ,127      ,224    ,21.703         ,23.175              ,0.936 
0    ,512  ,127      ,256    ,24.883         ,25.43               ,0.978 
0    ,512  ,127      ,288    ,26.298         ,26.515              ,0.992 
0    ,512  ,127      ,32     ,8.456          ,9.142               ,0.925 
0    ,512  ,127      ,320    ,26.787         ,30.445              ,0.88  
0    ,512  ,127      ,352    ,26.768         ,31.235              ,0.857 
0    ,512  ,127      ,384    ,26.813         ,32.966              ,0.813 
0    ,512  ,127      ,416    ,31.659         ,32.359              ,0.978 
0    ,512  ,127      ,448    ,31.659         ,34.141              ,0.927 
0    ,512  ,127      ,480    ,31.653         ,33.596              ,0.942 
0    ,512  ,127      ,5000   ,41.891         ,41.417              ,1.011 
0    ,512  ,127      ,512    ,31.538         ,36.786              ,0.857 
0    ,512  ,127      ,544    ,41.989         ,37.363              ,1.124 
0    ,512  ,127      ,576    ,42.276         ,37.994              ,1.113 
0    ,512  ,127      ,608    ,42.033         ,37.045              ,1.135 
0    ,512  ,127      ,64     ,11.594         ,15.701              ,0.738 
0    ,512  ,127      ,640    ,41.864         ,37.692              ,1.111 
0    ,512  ,127      ,672    ,41.934         ,41.474              ,1.011 
0    ,512  ,127      ,704    ,41.944         ,41.419              ,1.013 
0    ,512  ,127      ,736    ,41.991         ,41.586              ,1.01  
0    ,512  ,127      ,768    ,41.921         ,41.356              ,1.014 
0    ,512  ,127      ,800    ,41.983         ,41.394              ,1.014 
0    ,512  ,127      ,832    ,42.518         ,41.454              ,1.026 
0    ,512  ,127      ,864    ,41.914         ,41.342              ,1.014 
0    ,512  ,127      ,896    ,41.8           ,41.642              ,1.004 
0    ,512  ,127      ,928    ,42.012         ,41.354              ,1.016 
0    ,512  ,127      ,96     ,12.48          ,16.392              ,0.761 
0    ,512  ,127      ,960    ,41.87          ,43.373              ,0.965 
0    ,512  ,127      ,992    ,41.867         ,41.742              ,1.003 
0    ,528  ,127      ,16     ,8.391          ,9.293               ,0.903 
0    ,544  ,127      ,1024   ,43.101         ,41.449              ,1.04  
0    ,544  ,127      ,128    ,15.444         ,19.018              ,0.812 
0    ,544  ,127      ,256    ,24.483         ,25.001              ,0.979 
0    ,544  ,127      ,32     ,8.179          ,9.353               ,0.874 
0    ,544  ,127      ,512    ,31.643         ,36.862              ,0.858 
0    ,544  ,127      ,64     ,11.256         ,15.206              ,0.74  
0    ,560  ,127      ,16     ,7.766          ,9.446               ,0.822 
0    ,576  ,127      ,1024   ,45.631         ,41.479              ,1.1   
0    ,576  ,127      ,128    ,15.526         ,19.48               ,0.797 
0    ,576  ,127      ,256    ,24.474         ,24.807              ,0.987 
0    ,576  ,127      ,32     ,8.244          ,9.45                ,0.872 
0    ,576  ,127      ,512    ,31.66          ,37.825              ,0.837 
0    ,576  ,127      ,64     ,11.602         ,15.611              ,0.743 
0    ,592  ,127      ,16     ,7.991          ,9.556               ,0.836 
0    ,6    ,127      ,5      ,8.498          ,9.134               ,0.93  
0    ,6    ,127      ,5000   ,7.999          ,7.767               ,1.03  
0    ,6    ,127      ,6      ,8.148          ,8.948               ,0.911 
0    ,6    ,127      ,7      ,7.877          ,9.218               ,0.855 
0    ,608  ,127      ,1024   ,45.647         ,41.482              ,1.1   
0    ,608  ,127      ,128    ,15.588         ,19.387              ,0.804 
0    ,608  ,127      ,256    ,24.653         ,24.723              ,0.997 
0    ,608  ,127      ,32     ,8.028          ,8.953               ,0.897 
0    ,608  ,127      ,512    ,31.66          ,37.302              ,0.849 
0    ,608  ,127      ,64     ,11.819         ,14.897              ,0.793 
0    ,624  ,127      ,16     ,8.175          ,9.101               ,0.898 
0    ,64   ,127      ,128    ,14.215         ,18.247              ,0.779 
0    ,64   ,127      ,160    ,14.242         ,18.062              ,0.788 
0    ,64   ,127      ,192    ,14.176         ,14.246              ,0.995 
0    ,64   ,127      ,224    ,14.199         ,14.057              ,1.01  
0    ,64   ,127      ,256    ,14.202         ,13.852              ,1.025 
0    ,64   ,127      ,288    ,14.208         ,14.229              ,0.999 
0    ,64   ,127      ,32     ,8.243          ,9.068               ,0.909 
0    ,64   ,127      ,320    ,14.18          ,14.165              ,1.001 
0    ,64   ,127      ,352    ,14.164         ,14.056              ,1.008 
0    ,64   ,127      ,384    ,14.185         ,13.535              ,1.048 
0    ,64   ,127      ,416    ,14.203         ,14.318              ,0.992 
0    ,64   ,127      ,448    ,14.183         ,13.366              ,1.061 
0    ,64   ,127      ,480    ,14.178         ,13.852              ,1.024 
0    ,64   ,127      ,5000   ,14.273         ,14.58               ,0.979 
0    ,64   ,127      ,512    ,14.219         ,14.24               ,0.999 
0    ,64   ,127      ,544    ,14.156         ,13.952              ,1.015 
0    ,64   ,127      ,576    ,14.158         ,14.481              ,0.978 
0    ,64   ,127      ,608    ,14.189         ,14.159              ,1.002 
0    ,64   ,127      ,64     ,11.14          ,14.05               ,0.793 
0    ,64   ,127      ,640    ,14.171         ,13.543              ,1.046 
0    ,64   ,127      ,672    ,14.193         ,13.751              ,1.032 
0    ,64   ,127      ,704    ,14.182         ,13.959              ,1.016 
0    ,64   ,127      ,736    ,14.171         ,14.055              ,1.008 
0    ,64   ,127      ,768    ,14.157         ,14.204              ,0.997 
0    ,64   ,127      ,96     ,14.456         ,17.141              ,0.843 
0    ,640  ,127      ,1024   ,47.142         ,46.073              ,1.023 
0    ,640  ,127      ,128    ,15.872         ,18.998              ,0.835 
0    ,640  ,127      ,256    ,24.671         ,24.487              ,1.008 
0    ,640  ,127      ,32     ,8.396          ,9.055               ,0.927 
0    ,640  ,127      ,512    ,31.646         ,37.804              ,0.837 
0    ,640  ,127      ,64     ,11.552         ,14.921              ,0.774 
0    ,656  ,127      ,16     ,8.022          ,9.28                ,0.864 
0    ,672  ,127      ,1024   ,47.939         ,46.177              ,1.038 
0    ,672  ,127      ,128    ,16.03          ,19.0                ,0.844 
0    ,672  ,127      ,256    ,24.487         ,25.587              ,0.957 
0    ,672  ,127      ,32     ,7.765          ,9.282               ,0.837 
0    ,672  ,127      ,512    ,31.655         ,37.045              ,0.855 
0    ,672  ,127      ,64     ,11.707         ,15.716              ,0.745 
0    ,688  ,127      ,16     ,8.176          ,9.109               ,0.898 
0    ,7    ,127      ,5000   ,7.908          ,7.778               ,1.017 
0    ,7    ,127      ,6      ,8.239          ,9.113               ,0.904 
0    ,7    ,127      ,7      ,8.091          ,8.943               ,0.905 
0    ,7    ,127      ,8      ,8.495          ,9.113               ,0.932 
0    ,704  ,127      ,1024   ,50.512         ,46.444              ,1.088 
0    ,704  ,127      ,128    ,15.947         ,18.644              ,0.855 
0    ,704  ,127      ,256    ,24.475         ,24.618              ,0.994 
0    ,704  ,127      ,32     ,8.339          ,8.943               ,0.932 
0    ,704  ,127      ,512    ,31.672         ,37.016              ,0.856 
0    ,704  ,127      ,64     ,11.676         ,16.287              ,0.717 
0    ,720  ,127      ,16     ,8.073          ,9.451               ,0.854 
0    ,736  ,127      ,1024   ,50.557         ,46.873              ,1.079 
0    ,736  ,127      ,128    ,15.519         ,19.137              ,0.811 
0    ,736  ,127      ,256    ,24.493         ,24.042              ,1.019 
0    ,736  ,127      ,32     ,7.963          ,9.314               ,0.855 
0    ,736  ,127      ,512    ,31.674         ,37.365              ,0.848 
0    ,736  ,127      ,64     ,11.588         ,15.9                ,0.729 
0    ,7488 ,127      ,8192   ,328.179        ,308.263             ,1.065 
0    ,7520 ,127      ,8192   ,329.61         ,306.088             ,1.077 
0    ,7552 ,127      ,8192   ,337.338        ,308.477             ,1.094 
0    ,7584 ,127      ,8192   ,331.688        ,309.124             ,1.073 
0    ,7616 ,127      ,8192   ,336.799        ,308.588             ,1.091 
0    ,7648 ,127      ,8192   ,335.838        ,309.37              ,1.086 
0    ,768  ,127      ,1024   ,51.751         ,51.583              ,1.003 
0    ,768  ,127      ,128    ,15.601         ,19.449              ,0.802 
0    ,768  ,127      ,256    ,24.518         ,24.414              ,1.004 
0    ,768  ,127      ,512    ,31.647         ,36.928              ,0.857 
0    ,768  ,127      ,64     ,11.269         ,15.894              ,0.709 
0    ,7680 ,127      ,8192   ,337.088        ,310.192             ,1.087 
0    ,7712 ,127      ,8192   ,335.836        ,312.243             ,1.076 
0    ,7744 ,127      ,8192   ,341.67         ,313.952             ,1.088 
0    ,7776 ,127      ,8192   ,337.677        ,312.114             ,1.082 
0    ,7808 ,127      ,8192   ,338.394        ,313.933             ,1.078 
0    ,7840 ,127      ,8192   ,337.827        ,318.984             ,1.059 
0    ,7872 ,127      ,8192   ,338.106        ,315.827             ,1.071 
0    ,7904 ,127      ,8192   ,341.94         ,319.556             ,1.07  
0    ,7936 ,127      ,8192   ,345.793        ,319.103             ,1.084 
0    ,7968 ,127      ,8192   ,343.159        ,323.411             ,1.061 
0    ,8    ,127      ,5000   ,8.327          ,7.9                 ,1.054 
0    ,80   ,127      ,16     ,7.876          ,8.949               ,0.88  
0    ,800  ,127      ,1024   ,53.244         ,52.011              ,1.024 
0    ,800  ,127      ,128    ,15.693         ,19.293              ,0.813 
0    ,800  ,127      ,256    ,24.473         ,24.437              ,1.001 
0    ,800  ,127      ,512    ,31.654         ,36.836              ,0.859 
0    ,8000 ,127      ,8192   ,344.845        ,321.799             ,1.072 
0    ,8032 ,127      ,8192   ,343.376        ,322.474             ,1.065 
0    ,8064 ,127      ,8192   ,326.536        ,296.036             ,1.103 
0    ,8096 ,127      ,8192   ,328.024        ,301.152             ,1.089 
0    ,8128 ,127      ,8192   ,331.53         ,297.397             ,1.115 
0    ,8160 ,127      ,8192   ,331.008        ,303.453             ,1.091 
0    ,832  ,127      ,1024   ,57.15          ,51.405              ,1.112 
0    ,832  ,127      ,128    ,15.531         ,19.35               ,0.803 
0    ,832  ,127      ,256    ,24.545         ,24.501              ,1.002 
0    ,832  ,127      ,512    ,31.643         ,38.15               ,0.829 
0    ,864  ,127      ,1024   ,55.392         ,51.462              ,1.076 
0    ,864  ,127      ,256    ,24.472         ,24.553              ,0.997 
0    ,864  ,127      ,512    ,31.672         ,37.169              ,0.852 
0    ,896  ,127      ,1024   ,56.578         ,52.206              ,1.084 
0    ,896  ,127      ,256    ,24.485         ,24.586              ,0.996 
0    ,896  ,127      ,512    ,31.659         ,37.055              ,0.854 
0    ,928  ,127      ,1024   ,58.075         ,54.221              ,1.071 
0    ,928  ,127      ,256    ,24.829         ,24.799              ,1.001 
0    ,928  ,127      ,512    ,31.663         ,36.843              ,0.859 
0    ,96   ,127      ,128    ,17.064         ,17.918              ,0.952 
0    ,96   ,127      ,256    ,16.1           ,15.861              ,1.015 
0    ,96   ,127      ,32     ,8.507          ,9.108               ,0.934 
0    ,96   ,127      ,512    ,15.739         ,15.943              ,0.987 
0    ,96   ,127      ,64     ,11.63          ,14.875              ,0.782 
0    ,960  ,127      ,1024   ,60.301         ,56.801              ,1.062 
0    ,960  ,127      ,256    ,24.872         ,25.147              ,0.989 
0    ,960  ,127      ,512    ,31.651         ,36.958              ,0.856 
0    ,992  ,127      ,1024   ,60.336         ,57.422              ,1.051 
0    ,992  ,127      ,512    ,31.738         ,36.905              ,0.86  
1    ,1    ,127      ,0      ,8.786          ,10.542              ,0.833 
1    ,1    ,127      ,1      ,8.823          ,9.62                ,0.917 
1    ,1    ,127      ,128    ,8.579          ,8.57                ,1.001 
1    ,1    ,127      ,2      ,7.938          ,9.048               ,0.877 
1    ,1    ,127      ,5000   ,8.662          ,7.751               ,1.118 
1    ,1024 ,127      ,5000   ,61.941         ,61.077              ,1.014 
1    ,128  ,127      ,1      ,8.993          ,8.961               ,1.004 
1    ,128  ,127      ,5000   ,17.592         ,16.919              ,1.04  
1    ,16   ,127      ,5000   ,8.352          ,7.627               ,1.095 
1    ,2    ,127      ,1      ,8.51           ,8.819               ,0.965 
1    ,256  ,127      ,5000   ,32.189         ,32.041              ,1.005 
1    ,32   ,127      ,5000   ,12.297         ,12.844              ,0.957 
1    ,4    ,127      ,5000   ,8.183          ,7.857               ,1.041 
1    ,5000 ,127      ,1      ,8.134          ,9.275               ,0.877 
1    ,5000 ,127      ,1024   ,55.756         ,57.004              ,0.978 
1    ,5000 ,127      ,128    ,17.023         ,18.302              ,0.93  
1    ,5000 ,127      ,16     ,7.98           ,9.495               ,0.84  
1    ,5000 ,127      ,256    ,26.307         ,26.073              ,1.009 
1    ,5000 ,127      ,32     ,8.057          ,9.253               ,0.871 
1    ,5000 ,127      ,4      ,8.057          ,9.275               ,0.869 
1    ,5000 ,127      ,512    ,36.553         ,38.267              ,0.955 
1    ,5000 ,127      ,64     ,12.699         ,17.646              ,0.72  
1    ,5000 ,127      ,8      ,8.276          ,9.449               ,0.876 
1    ,512  ,127      ,5000   ,41.962         ,41.559              ,1.01  
1    ,64   ,127      ,5000   ,14.202         ,14.256              ,0.996 
1    ,8    ,127      ,5000   ,8.383          ,7.613               ,1.101 
2    ,2    ,127      ,1      ,8.013          ,8.943               ,0.896 
2    ,2    ,127      ,2      ,8.014          ,8.943               ,0.896 
2    ,2    ,127      ,3      ,8.214          ,8.953               ,0.917 
2    ,2    ,127      ,5000   ,8.133          ,7.479               ,1.087 
2    ,5000 ,127      ,2      ,7.872          ,9.802               ,0.803 
3    ,3    ,127      ,2      ,8.54           ,8.965               ,0.953 
3    ,3    ,127      ,3      ,8.26           ,8.943               ,0.924 
3    ,3    ,127      ,4      ,8.314          ,8.966               ,0.927 
3    ,3    ,127      ,5000   ,8.127          ,7.177               ,1.132 
3    ,5000 ,127      ,3      ,7.952          ,9.648               ,0.824 
32   ,1    ,127      ,128    ,8.566          ,8.881               ,0.964 
32   ,1    ,127      ,2      ,8.76           ,9.099               ,0.963 
32   ,128  ,127      ,1      ,8.717          ,8.944               ,0.975 
32   ,2    ,127      ,1      ,8.889          ,9.109               ,0.976 
33   ,1    ,127      ,128    ,8.826          ,8.419               ,1.048 
33   ,1    ,127      ,2      ,8.587          ,9.136               ,0.94  
33   ,128  ,127      ,1      ,8.82           ,8.973               ,0.983 
33   ,2    ,127      ,1      ,8.91           ,8.952               ,0.995 
4    ,4    ,127      ,3      ,8.127          ,8.943               ,0.909 
4    ,4    ,127      ,4      ,7.993          ,8.948               ,0.893 
4    ,4    ,127      ,5      ,8.6            ,9.107               ,0.944 
4    ,4    ,127      ,5000   ,8.232          ,7.626               ,1.079 
4    ,5000 ,127      ,4      ,7.77           ,9.413               ,0.825 
5    ,5    ,127      ,4      ,7.872          ,9.446               ,0.833 
5    ,5    ,127      ,5      ,7.872          ,8.915               ,0.883 
5    ,5    ,127      ,5000   ,7.98           ,7.329               ,1.089 
5    ,5    ,127      ,6      ,8.178          ,9.446               ,0.866 
5    ,5000 ,127      ,5      ,8.255          ,9.456               ,0.873 
6    ,5000 ,127      ,6      ,8.068          ,9.62                ,0.839 
6    ,6    ,127      ,5      ,7.77           ,8.943               ,0.869 
6    ,6    ,127      ,5000   ,8.362          ,7.463               ,1.12  
6    ,6    ,127      ,6      ,7.987          ,8.949               ,0.893 
6    ,6    ,127      ,7      ,8.097          ,9.107               ,0.889 
64   ,1024 ,127      ,1024   ,64.971         ,55.783              ,1.165 
64   ,1024 ,127      ,1056   ,65.377         ,63.04               ,1.037 
64   ,1024 ,127      ,1088   ,65.398         ,62.278              ,1.05  
64   ,1024 ,127      ,1120   ,65.333         ,61.871              ,1.056 
64   ,1024 ,127      ,1152   ,65.387         ,60.977              ,1.072 
64   ,1024 ,127      ,1184   ,65.403         ,61.454              ,1.064 
64   ,1024 ,127      ,1216   ,65.416         ,62.302              ,1.05  
64   ,1024 ,127      ,1248   ,65.408         ,60.884              ,1.074 
64   ,1024 ,127      ,1280   ,65.39          ,62.096              ,1.053 
64   ,1024 ,127      ,1312   ,65.439         ,60.899              ,1.075 
64   ,1024 ,127      ,1344   ,65.408         ,61.893              ,1.057 
64   ,1024 ,127      ,1376   ,65.416         ,61.402              ,1.065 
64   ,1024 ,127      ,1408   ,65.419         ,61.418              ,1.065 
64   ,1024 ,127      ,1440   ,65.391         ,62.334              ,1.049 
64   ,1024 ,127      ,1472   ,65.463         ,61.948              ,1.057 
64   ,1024 ,127      ,1504   ,65.411         ,62.018              ,1.055 
64   ,1024 ,127      ,1536   ,65.417         ,61.016              ,1.072 
64   ,1024 ,127      ,1568   ,65.892         ,61.578              ,1.07  
64   ,1024 ,127      ,1600   ,65.384         ,61.727              ,1.059 
64   ,1024 ,127      ,1632   ,65.415         ,60.985              ,1.073 
64   ,1024 ,127      ,1664   ,65.416         ,61.007              ,1.072 
64   ,1024 ,127      ,1696   ,65.424         ,60.987              ,1.073 
64   ,1024 ,127      ,1728   ,65.373         ,61.051              ,1.071 
64   ,1024 ,127      ,320    ,26.766         ,33.089              ,0.809 
64   ,1024 ,127      ,352    ,31.673         ,32.153              ,0.985 
64   ,1024 ,127      ,384    ,31.643         ,33.68               ,0.94  
64   ,1024 ,127      ,416    ,31.774         ,34.205              ,0.929 
64   ,1024 ,127      ,448    ,31.646         ,36.928              ,0.857 
64   ,1024 ,127      ,480    ,36.544         ,38.926              ,0.939 
64   ,1024 ,127      ,512    ,36.515         ,38.739              ,0.943 
64   ,1024 ,127      ,544    ,36.517         ,40.344              ,0.905 
64   ,1024 ,127      ,576    ,36.509         ,42.023              ,0.869 
64   ,1024 ,127      ,608    ,41.605         ,40.212              ,1.035 
64   ,1024 ,127      ,640    ,41.74          ,44.206              ,0.944 
64   ,1024 ,127      ,672    ,41.64          ,44.05               ,0.945 
64   ,1024 ,127      ,704    ,41.663         ,46.577              ,0.894 
64   ,1024 ,127      ,736    ,46.661         ,47.867              ,0.975 
64   ,1024 ,127      ,768    ,46.684         ,48.378              ,0.965 
64   ,1024 ,127      ,800    ,46.629         ,50.581              ,0.922 
64   ,1024 ,127      ,832    ,46.701         ,52.198              ,0.895 
64   ,1024 ,127      ,864    ,51.219         ,50.305              ,1.018 
64   ,1024 ,127      ,896    ,51.27          ,52.707              ,0.973 
64   ,1024 ,127      ,928    ,51.218         ,54.9                ,0.933 
64   ,1024 ,127      ,960    ,53.183         ,55.933              ,0.951 
64   ,1024 ,127      ,992    ,65.381         ,58.381              ,1.12  
64   ,1056 ,127      ,1024   ,65.39          ,56.236              ,1.163 
64   ,1056 ,127      ,512    ,36.552         ,38.571              ,0.948 
64   ,1088 ,127      ,1024   ,55.746         ,58.405              ,0.954 
64   ,1088 ,127      ,512    ,36.516         ,38.472              ,0.949 
64   ,112  ,127      ,16     ,7.765          ,9.107               ,0.853 
64   ,1120 ,127      ,1024   ,55.837         ,57.316              ,0.974 
64   ,1120 ,127      ,512    ,36.537         ,38.32               ,0.953 
64   ,1152 ,127      ,1024   ,55.772         ,57.132              ,0.976 
64   ,1152 ,127      ,512    ,36.667         ,38.258              ,0.958 
64   ,1184 ,127      ,1024   ,55.83          ,57.747              ,0.967 
64   ,1184 ,127      ,512    ,36.546         ,38.311              ,0.954 
64   ,1216 ,127      ,1024   ,55.75          ,57.945              ,0.962 
64   ,1216 ,127      ,512    ,36.52          ,38.478              ,0.949 
64   ,1248 ,127      ,1024   ,55.72          ,56.268              ,0.99  
64   ,128  ,127      ,128    ,14.951         ,17.567              ,0.851 
64   ,128  ,127      ,160    ,18.82          ,19.533              ,0.963 
64   ,128  ,127      ,192    ,17.623         ,17.414              ,1.012 
64   ,128  ,127      ,224    ,17.614         ,17.218              ,1.023 
64   ,128  ,127      ,256    ,17.599         ,16.681              ,1.055 
64   ,128  ,127      ,288    ,17.587         ,17.555              ,1.002 
64   ,128  ,127      ,32     ,8.451          ,9.044               ,0.934 
64   ,128  ,127      ,320    ,17.591         ,16.707              ,1.053 
64   ,128  ,127      ,352    ,17.603         ,16.752              ,1.051 
64   ,128  ,127      ,384    ,17.581         ,17.236              ,1.02  
64   ,128  ,127      ,416    ,17.593         ,16.641              ,1.057 
64   ,128  ,127      ,448    ,17.571         ,16.475              ,1.067 
64   ,128  ,127      ,480    ,17.636         ,16.765              ,1.052 
64   ,128  ,127      ,512    ,17.594         ,16.557              ,1.063 
64   ,128  ,127      ,544    ,17.601         ,16.699              ,1.054 
64   ,128  ,127      ,576    ,17.587         ,16.917              ,1.04  
64   ,128  ,127      ,608    ,17.58          ,16.869              ,1.042 
64   ,128  ,127      ,64     ,11.533         ,15.234              ,0.757 
64   ,128  ,127      ,640    ,17.605         ,16.752              ,1.051 
64   ,128  ,127      ,672    ,17.598         ,16.915              ,1.04  
64   ,128  ,127      ,704    ,17.584         ,16.814              ,1.046 
64   ,128  ,127      ,736    ,17.604         ,16.323              ,1.078 
64   ,128  ,127      ,768    ,17.607         ,17.409              ,1.011 
64   ,128  ,127      ,800    ,17.617         ,16.328              ,1.079 
64   ,128  ,127      ,832    ,17.609         ,16.614              ,1.06  
64   ,128  ,127      ,96     ,12.296         ,16.585              ,0.741 
64   ,1280 ,127      ,1024   ,55.662         ,56.854              ,0.979 
64   ,1312 ,127      ,1024   ,55.745         ,56.286              ,0.99  
64   ,1344 ,127      ,1024   ,55.767         ,56.956              ,0.979 
64   ,1344 ,127      ,2048   ,76.337         ,73.18               ,1.043 
64   ,1376 ,127      ,1024   ,55.742         ,56.238              ,0.991 
64   ,1376 ,127      ,2048   ,76.411         ,72.703              ,1.051 
64   ,1408 ,127      ,1024   ,55.742         ,56.724              ,0.983 
64   ,1408 ,127      ,2048   ,79.436         ,75.642              ,1.05  
64   ,144  ,127      ,16     ,7.986          ,8.943               ,0.893 
64   ,1440 ,127      ,1024   ,55.829         ,56.224              ,0.993 
64   ,1440 ,127      ,2048   ,79.46          ,75.995              ,1.046 
64   ,1472 ,127      ,1024   ,55.69          ,57.423              ,0.97  
64   ,1472 ,127      ,2048   ,80.68          ,78.008              ,1.034 
64   ,1504 ,127      ,1024   ,55.713         ,56.294              ,0.99  
64   ,1504 ,127      ,2048   ,80.871         ,77.185              ,1.048 
64   ,1536 ,127      ,1024   ,55.79          ,56.47               ,0.988 
64   ,1536 ,127      ,2048   ,84.926         ,79.761              ,1.065 
64   ,1568 ,127      ,1024   ,55.771         ,57.598              ,0.968 
64   ,1568 ,127      ,2048   ,83.826         ,80.832              ,1.037 
64   ,16   ,127      ,112    ,8.045          ,7.761               ,1.037 
64   ,16   ,127      ,144    ,8.06           ,7.325               ,1.1   
64   ,16   ,127      ,16     ,7.659          ,8.781               ,0.872 
64   ,16   ,127      ,176    ,8.018          ,7.925               ,1.012 
64   ,16   ,127      ,208    ,8.175          ,7.767               ,1.052 
64   ,16   ,127      ,240    ,8.392          ,7.472               ,1.123 
64   ,16   ,127      ,272    ,7.988          ,7.458               ,1.071 
64   ,16   ,127      ,304    ,8.391          ,7.469               ,1.123 
64   ,16   ,127      ,336    ,7.987          ,7.611               ,1.05  
64   ,16   ,127      ,368    ,8.207          ,7.466               ,1.099 
64   ,16   ,127      ,400    ,7.982          ,7.631               ,1.046 
64   ,16   ,127      ,432    ,8.017          ,7.692               ,1.042 
64   ,16   ,127      ,464    ,7.986          ,8.078               ,0.989 
64   ,16   ,127      ,48     ,8.09           ,7.976               ,1.014 
64   ,16   ,127      ,496    ,8.022          ,7.466               ,1.074 
64   ,16   ,127      ,528    ,7.77           ,7.904               ,0.983 
64   ,16   ,127      ,560    ,7.872          ,7.785               ,1.011 
64   ,16   ,127      ,592    ,8.124          ,7.612               ,1.067 
64   ,16   ,127      ,624    ,7.999          ,7.642               ,1.047 
64   ,16   ,127      ,656    ,8.09           ,8.08                ,1.001 
64   ,16   ,127      ,688    ,8.017          ,7.911               ,1.013 
64   ,16   ,127      ,720    ,7.77           ,8.901               ,0.873 
64   ,16   ,127      ,80     ,8.277          ,8.178               ,1.012 
64   ,160  ,127      ,128    ,15.611         ,18.063              ,0.864 
64   ,160  ,127      ,256    ,20.686         ,21.588              ,0.958 
64   ,160  ,127      ,32     ,8.055          ,9.108               ,0.884 
64   ,160  ,127      ,512    ,30.56          ,27.229              ,1.122 
64   ,160  ,127      ,64     ,11.603         ,15.072              ,0.77  
64   ,1600 ,127      ,1024   ,55.78          ,57.748              ,0.966 
64   ,1600 ,127      ,2048   ,84.93          ,81.269              ,1.045 
64   ,1632 ,127      ,1024   ,55.708         ,56.24               ,0.991 
64   ,1632 ,127      ,2048   ,85.373         ,84.345              ,1.012 
64   ,1664 ,127      ,1024   ,55.749         ,57.664              ,0.967 
64   ,1664 ,127      ,2048   ,88.322         ,83.584              ,1.057 
64   ,1696 ,127      ,1024   ,55.741         ,56.289              ,0.99  
64   ,1696 ,127      ,2048   ,88.315         ,83.622              ,1.056 
64   ,1728 ,127      ,1024   ,55.793         ,56.68               ,0.984 
64   ,1728 ,127      ,2048   ,89.163         ,87.273              ,1.022 
64   ,176  ,127      ,16     ,7.985          ,8.943               ,0.893 
64   ,1760 ,127      ,2048   ,89.93          ,89.578              ,1.004 
64   ,1792 ,127      ,2048   ,92.85          ,88.006              ,1.055 
64   ,1824 ,127      ,2048   ,92.89          ,99.222              ,0.936 
64   ,1856 ,127      ,2048   ,93.394         ,94.794              ,0.985 
64   ,1888 ,127      ,2048   ,94.411         ,90.86               ,1.039 
64   ,192  ,127      ,128    ,16.037         ,18.753              ,0.855 
64   ,192  ,127      ,256    ,22.517         ,23.013              ,0.978 
64   ,192  ,127      ,32     ,8.018          ,8.948               ,0.896 
64   ,192  ,127      ,512    ,32.004         ,32.347              ,0.989 
64   ,192  ,127      ,64     ,11.592         ,14.371              ,0.807 
64   ,1920 ,127      ,2048   ,97.241         ,93.101              ,1.044 
64   ,1952 ,127      ,2048   ,97.428         ,94.797              ,1.028 
64   ,1984 ,127      ,2048   ,97.133         ,90.174              ,1.077 
64   ,2016 ,127      ,2048   ,98.795         ,91.341              ,1.082 
64   ,2048 ,127      ,1344   ,65.138         ,68.223              ,0.955 
64   ,2048 ,127      ,1376   ,69.864         ,67.995              ,1.027 
64   ,2048 ,127      ,1408   ,69.756         ,69.011              ,1.011 
64   ,2048 ,127      ,1440   ,69.704         ,73.781              ,0.945 
64   ,2048 ,127      ,1472   ,69.865         ,74.31               ,0.94  
64   ,2048 ,127      ,1504   ,73.951         ,76.322              ,0.969 
64   ,2048 ,127      ,1536   ,74.002         ,73.291              ,1.01  
64   ,2048 ,127      ,1568   ,73.91          ,77.498              ,0.954 
64   ,2048 ,127      ,1600   ,74.011         ,76.796              ,0.964 
64   ,2048 ,127      ,1632   ,78.217         ,80.64               ,0.97  
64   ,2048 ,127      ,1664   ,78.286         ,77.664              ,1.008 
64   ,2048 ,127      ,1696   ,78.253         ,81.062              ,0.965 
64   ,2048 ,127      ,1728   ,78.262         ,82.213              ,0.952 
64   ,2048 ,127      ,1760   ,82.727         ,87.374              ,0.947 
64   ,2048 ,127      ,1792   ,84.58          ,82.094              ,1.03  
64   ,2048 ,127      ,1824   ,83.053         ,85.553              ,0.971 
64   ,2048 ,127      ,1856   ,84.106         ,89.528              ,0.939 
64   ,2048 ,127      ,1888   ,87.284         ,89.447              ,0.976 
64   ,2048 ,127      ,1920   ,87.312         ,88.408              ,0.988 
64   ,2048 ,127      ,1952   ,87.307         ,99.35               ,0.879 
64   ,2048 ,127      ,1984   ,87.275         ,91.012              ,0.959 
64   ,2048 ,127      ,2016   ,101.886        ,91.884              ,1.109 
64   ,2048 ,127      ,2048   ,101.515        ,89.226              ,1.138 
64   ,2048 ,127      ,2080   ,101.94         ,99.396              ,1.026 
64   ,2048 ,127      ,2112   ,101.904        ,96.903              ,1.052 
64   ,2048 ,127      ,2144   ,101.87         ,99.579              ,1.023 
64   ,2048 ,127      ,2176   ,101.849        ,96.54               ,1.055 
64   ,2048 ,127      ,2208   ,101.879        ,98.68               ,1.032 
64   ,2048 ,127      ,2240   ,101.91         ,102.184             ,0.997 
64   ,2048 ,127      ,2272   ,101.87         ,104.041             ,0.979 
64   ,2048 ,127      ,2304   ,101.912        ,96.477              ,1.056 
64   ,2048 ,127      ,2336   ,101.909        ,98.526              ,1.034 
64   ,2048 ,127      ,2368   ,101.899        ,96.566              ,1.055 
64   ,2048 ,127      ,2400   ,101.916        ,96.489              ,1.056 
64   ,2048 ,127      ,2432   ,101.903        ,96.423              ,1.057 
64   ,2048 ,127      ,2464   ,101.905        ,99.235              ,1.027 
64   ,2048 ,127      ,2496   ,104.879        ,96.592              ,1.086 
64   ,2048 ,127      ,2528   ,101.86         ,96.762              ,1.053 
64   ,2048 ,127      ,2560   ,101.881        ,96.481              ,1.056 
64   ,2048 ,127      ,2592   ,101.88         ,96.514              ,1.056 
64   ,2048 ,127      ,2624   ,101.892        ,98.573              ,1.034 
64   ,2048 ,127      ,2656   ,101.857        ,96.487              ,1.056 
64   ,2048 ,127      ,2688   ,101.889        ,98.711              ,1.032 
64   ,2048 ,127      ,2720   ,101.908        ,96.524              ,1.056 
64   ,2048 ,127      ,2752   ,101.91         ,96.637              ,1.055 
64   ,208  ,127      ,16     ,7.981          ,9.125               ,0.875 
64   ,2080 ,127      ,2048   ,102.44         ,89.479              ,1.145 
64   ,2112 ,127      ,2048   ,91.705         ,89.65               ,1.023 
64   ,2144 ,127      ,2048   ,91.734         ,89.971              ,1.02  
64   ,2176 ,127      ,2048   ,91.835         ,89.61               ,1.025 
64   ,2208 ,127      ,2048   ,91.823         ,92.301              ,0.995 
64   ,224  ,127      ,128    ,15.289         ,18.061              ,0.847 
64   ,224  ,127      ,256    ,25.093         ,24.496              ,1.024 
64   ,224  ,127      ,32     ,7.985          ,8.786               ,0.909 
64   ,224  ,127      ,512    ,33.216         ,31.969              ,1.039 
64   ,224  ,127      ,64     ,11.702         ,15.55               ,0.753 
64   ,2240 ,127      ,2048   ,91.783         ,89.771              ,1.022 
64   ,2272 ,127      ,2048   ,91.741         ,95.858              ,0.957 
64   ,2304 ,127      ,2048   ,91.698         ,92.925              ,0.987 
64   ,2336 ,127      ,2048   ,91.693         ,91.869              ,0.998 
64   ,2368 ,127      ,2048   ,91.767         ,89.529              ,1.025 
64   ,240  ,127      ,16     ,7.95           ,9.458               ,0.841 
64   ,2400 ,127      ,2048   ,91.725         ,92.771              ,0.989 
64   ,2432 ,127      ,2048   ,93.544         ,89.835              ,1.041 
64   ,2464 ,127      ,2048   ,91.614         ,91.863              ,0.997 
64   ,2496 ,127      ,2048   ,91.719         ,92.649              ,0.99  
64   ,2528 ,127      ,2048   ,91.735         ,89.808              ,1.021 
64   ,256  ,127      ,128    ,16.115         ,18.733              ,0.86  
64   ,256  ,127      ,160    ,17.265         ,18.078              ,0.955 
64   ,256  ,127      ,192    ,20.265         ,21.571              ,0.939 
64   ,256  ,127      ,224    ,22.035         ,22.455              ,0.981 
64   ,256  ,127      ,256    ,24.459         ,23.401              ,1.045 
64   ,256  ,127      ,288    ,27.516         ,26.141              ,1.053 
64   ,256  ,127      ,32     ,8.08           ,8.966               ,0.901 
64   ,256  ,127      ,320    ,35.289         ,32.263              ,1.094 
64   ,256  ,127      ,352    ,35.299         ,31.886              ,1.107 
64   ,256  ,127      ,384    ,35.324         ,31.95               ,1.106 
64   ,256  ,127      ,416    ,35.299         ,33.723              ,1.047 
64   ,256  ,127      ,448    ,35.356         ,31.778              ,1.113 
64   ,256  ,127      ,480    ,35.331         ,31.728              ,1.114 
64   ,256  ,127      ,512    ,35.305         ,31.78               ,1.111 
64   ,256  ,127      ,544    ,35.298         ,31.759              ,1.111 
64   ,256  ,127      ,576    ,35.276         ,31.757              ,1.111 
64   ,256  ,127      ,608    ,35.297         ,31.762              ,1.111 
64   ,256  ,127      ,64     ,11.657         ,15.239              ,0.765 
64   ,256  ,127      ,640    ,35.423         ,31.735              ,1.116 
64   ,256  ,127      ,672    ,35.299         ,31.788              ,1.11  
64   ,256  ,127      ,704    ,35.371         ,31.742              ,1.114 
64   ,256  ,127      ,736    ,35.294         ,31.754              ,1.111 
64   ,256  ,127      ,768    ,35.314         ,31.747              ,1.112 
64   ,256  ,127      ,800    ,35.289         ,31.731              ,1.112 
64   ,256  ,127      ,832    ,35.291         ,31.744              ,1.112 
64   ,256  ,127      ,864    ,35.304         ,31.789              ,1.111 
64   ,256  ,127      ,896    ,35.312         ,31.775              ,1.111 
64   ,256  ,127      ,928    ,35.306         ,31.767              ,1.111 
64   ,256  ,127      ,96     ,12.99          ,15.93               ,0.815 
64   ,256  ,127      ,960    ,35.303         ,31.738              ,1.112 
64   ,2560 ,127      ,2048   ,91.702         ,89.771              ,1.022 
64   ,2592 ,127      ,2048   ,91.794         ,89.687              ,1.023 
64   ,2624 ,127      ,2048   ,91.692         ,96.465              ,0.951 
64   ,2656 ,127      ,2048   ,91.783         ,89.551              ,1.025 
64   ,2688 ,127      ,2048   ,91.787         ,89.754              ,1.023 
64   ,272  ,127      ,16     ,7.873          ,9.114               ,0.864 
64   ,2720 ,127      ,2048   ,92.755         ,89.501              ,1.036 
64   ,2752 ,127      ,2048   ,91.612         ,89.613              ,1.022 
64   ,288  ,127      ,128    ,15.772         ,17.991              ,0.877 
64   ,288  ,127      ,256    ,24.482         ,24.561              ,0.997 
64   ,288  ,127      ,32     ,7.995          ,8.788               ,0.91  
64   ,288  ,127      ,512    ,35.334         ,31.832              ,1.11  
64   ,288  ,127      ,64     ,11.718         ,15.726              ,0.745 
64   ,304  ,127      ,16     ,8.038          ,9.446               ,0.851 
64   ,32   ,127      ,128    ,12.379         ,16.075              ,0.77  
64   ,32   ,127      ,160    ,12.299         ,16.699              ,0.737 
64   ,32   ,127      ,192    ,12.308         ,13.155              ,0.936 
64   ,32   ,127      ,224    ,12.301         ,12.975              ,0.948 
64   ,32   ,127      ,256    ,12.307         ,13.601              ,0.905 
64   ,32   ,127      ,288    ,12.294         ,13.125              ,0.937 
64   ,32   ,127      ,32     ,7.66           ,8.781               ,0.872 
64   ,32   ,127      ,320    ,12.345         ,12.939              ,0.954 
64   ,32   ,127      ,352    ,12.299         ,13.598              ,0.904 
64   ,32   ,127      ,384    ,12.294         ,13.147              ,0.935 
64   ,32   ,127      ,416    ,12.342         ,12.807              ,0.964 
64   ,32   ,127      ,448    ,12.336         ,12.772              ,0.966 
64   ,32   ,127      ,480    ,12.294         ,13.09               ,0.939 
64   ,32   ,127      ,512    ,12.302         ,12.968              ,0.949 
64   ,32   ,127      ,544    ,12.299         ,12.801              ,0.961 
64   ,32   ,127      ,576    ,12.335         ,12.474              ,0.989 
64   ,32   ,127      ,608    ,12.336         ,12.551              ,0.983 
64   ,32   ,127      ,64     ,12.454         ,15.309              ,0.814 
64   ,32   ,127      ,640    ,12.318         ,13.428              ,0.917 
64   ,32   ,127      ,672    ,12.294         ,12.801              ,0.96  
64   ,32   ,127      ,704    ,12.317         ,12.469              ,0.988 
64   ,32   ,127      ,736    ,12.299         ,12.824              ,0.959 
64   ,32   ,127      ,96     ,12.414         ,15.582              ,0.797 
64   ,320  ,127      ,1024   ,37.12          ,36.922              ,1.005 
64   ,320  ,127      ,128    ,15.531         ,18.152              ,0.856 
64   ,320  ,127      ,256    ,24.657         ,24.923              ,0.989 
64   ,320  ,127      ,32     ,8.09           ,9.282               ,0.872 
64   ,320  ,127      ,512    ,37.521         ,36.809              ,1.019 
64   ,320  ,127      ,64     ,11.702         ,15.282              ,0.766 
64   ,336  ,127      ,16     ,7.872          ,9.208               ,0.855 
64   ,3392 ,127      ,4096   ,142.48         ,192.837             ,0.739 
64   ,3424 ,127      ,4096   ,145.42         ,192.324             ,0.756 
64   ,3456 ,127      ,4096   ,149.279        ,197.89              ,0.754 
64   ,3488 ,127      ,4096   ,149.085        ,194.594             ,0.766 
64   ,352  ,127      ,1024   ,38.319         ,36.879              ,1.039 
64   ,352  ,127      ,128    ,15.359         ,18.48               ,0.831 
64   ,352  ,127      ,256    ,24.475         ,24.979              ,0.98  
64   ,352  ,127      ,32     ,8.277          ,9.275               ,0.892 
64   ,352  ,127      ,512    ,38.31          ,36.782              ,1.042 
64   ,352  ,127      ,64     ,11.704         ,15.576              ,0.751 
64   ,3520 ,127      ,4096   ,146.936        ,196.64              ,0.747 
64   ,3552 ,127      ,4096   ,149.58         ,194.169             ,0.77  
64   ,3584 ,127      ,4096   ,153.647        ,192.594             ,0.798 
64   ,3616 ,127      ,4096   ,153.753        ,194.453             ,0.791 
64   ,3648 ,127      ,4096   ,151.528        ,194.552             ,0.779 
64   ,368  ,127      ,16     ,8.371          ,8.948               ,0.936 
64   ,3680 ,127      ,4096   ,153.849        ,195.728             ,0.786 
64   ,3712 ,127      ,4096   ,158.049        ,201.301             ,0.785 
64   ,3744 ,127      ,4096   ,158.077        ,199.971             ,0.79  
64   ,3776 ,127      ,4096   ,155.904        ,199.662             ,0.781 
64   ,3808 ,127      ,4096   ,159.441        ,204.15              ,0.781 
64   ,384  ,127      ,1024   ,40.642         ,36.491              ,1.114 
64   ,384  ,127      ,128    ,15.605         ,19.163              ,0.814 
64   ,384  ,127      ,256    ,24.459         ,24.134              ,1.013 
64   ,384  ,127      ,32     ,8.239          ,9.035               ,0.912 
64   ,384  ,127      ,512    ,40.663         ,36.5                ,1.114 
64   ,384  ,127      ,64     ,11.714         ,15.514              ,0.755 
64   ,3840 ,127      ,4096   ,162.766        ,205.416             ,0.792 
64   ,3872 ,127      ,4096   ,162.281        ,204.965             ,0.792 
64   ,3904 ,127      ,4096   ,162.984        ,204.368             ,0.798 
64   ,3936 ,127      ,4096   ,166.82         ,205.068             ,0.813 
64   ,3968 ,127      ,4096   ,166.561        ,205.982             ,0.809 
64   ,400  ,127      ,16     ,8.277          ,9.275               ,0.892 
64   ,4000 ,127      ,4096   ,166.61         ,205.727             ,0.81  
64   ,4032 ,127      ,4096   ,166.001        ,182.025             ,0.912 
64   ,4064 ,127      ,4096   ,170.568        ,183.146             ,0.931 
64   ,4096 ,127      ,3392   ,134.25         ,167.121             ,0.803 
64   ,4096 ,127      ,3424   ,138.383        ,170.362             ,0.812 
64   ,4096 ,127      ,3456   ,138.382        ,169.139             ,0.818 
64   ,4096 ,127      ,3488   ,138.307        ,175.368             ,0.789 
64   ,4096 ,127      ,3520   ,138.249        ,167.96              ,0.823 
64   ,4096 ,127      ,3552   ,142.7          ,170.201             ,0.838 
64   ,4096 ,127      ,3584   ,142.6          ,171.287             ,0.833 
64   ,4096 ,127      ,3616   ,142.872        ,177.928             ,0.803 
64   ,4096 ,127      ,3648   ,142.755        ,168.606             ,0.847 
64   ,4096 ,127      ,3680   ,146.907        ,172.935             ,0.849 
64   ,4096 ,127      ,3712   ,146.919        ,170.171             ,0.863 
64   ,4096 ,127      ,3744   ,149.022        ,176.907             ,0.842 
64   ,4096 ,127      ,3776   ,146.889        ,179.14              ,0.82  
64   ,4096 ,127      ,3808   ,151.458        ,175.67              ,0.862 
64   ,4096 ,127      ,3840   ,152.743        ,177.074             ,0.863 
64   ,4096 ,127      ,3872   ,151.354        ,179.163             ,0.845 
64   ,4096 ,127      ,3904   ,151.249        ,176.688             ,0.856 
64   ,4096 ,127      ,3936   ,164.341        ,187.46              ,0.877 
64   ,4096 ,127      ,3968   ,155.67         ,180.712             ,0.861 
64   ,4096 ,127      ,4000   ,155.521        ,186.318             ,0.835 
64   ,4096 ,127      ,4032   ,158.276        ,184.134             ,0.86  
64   ,4096 ,127      ,4064   ,170.873        ,182.524             ,0.936 
64   ,4096 ,127      ,4096   ,170.591        ,176.172             ,0.968 
64   ,4096 ,127      ,4128   ,170.902        ,182.988             ,0.934 
64   ,4096 ,127      ,4160   ,176.928        ,180.486             ,0.98  
64   ,4096 ,127      ,4192   ,206.3          ,209.978             ,0.982 
64   ,4096 ,127      ,4224   ,195.332        ,209.087             ,0.934 
64   ,4096 ,127      ,4256   ,180.675        ,212.911             ,0.849 
64   ,4096 ,127      ,4288   ,175.117        ,223.157             ,0.785 
64   ,4096 ,127      ,4320   ,170.977        ,209.594             ,0.816 
64   ,4096 ,127      ,4352   ,170.872        ,214.023             ,0.798 
64   ,4096 ,127      ,4384   ,171.021        ,210.292             ,0.813 
64   ,4096 ,127      ,4416   ,171.041        ,208.592             ,0.82  
64   ,4096 ,127      ,4448   ,170.921        ,213.242             ,0.802 
64   ,4096 ,127      ,4480   ,175.356        ,209.464             ,0.837 
64   ,4096 ,127      ,4512   ,170.966        ,207.261             ,0.825 
64   ,4096 ,127      ,4544   ,174.869        ,209.292             ,0.836 
64   ,4096 ,127      ,4576   ,170.922        ,207.908             ,0.822 
64   ,4096 ,127      ,4608   ,175.563        ,211.322             ,0.831 
64   ,4096 ,127      ,4640   ,170.879        ,217.164             ,0.787 
64   ,4096 ,127      ,4672   ,184.643        ,209.815             ,0.88  
64   ,4096 ,127      ,4704   ,171.124        ,215.689             ,0.793 
64   ,4096 ,127      ,4736   ,180.595        ,217.091             ,0.832 
64   ,4096 ,127      ,4768   ,170.862        ,212.905             ,0.803 
64   ,4096 ,127      ,4800   ,171.022        ,210.846             ,0.811 
64   ,4128 ,127      ,4096   ,171.589        ,185.468             ,0.925 
64   ,416  ,127      ,1024   ,40.673         ,36.987              ,1.1   
64   ,416  ,127      ,128    ,15.241         ,18.563              ,0.821 
64   ,416  ,127      ,256    ,24.472         ,25.273              ,0.968 
64   ,416  ,127      ,32     ,8.169          ,8.955               ,0.912 
64   ,416  ,127      ,512    ,40.665         ,36.518              ,1.114 
64   ,416  ,127      ,64     ,11.51          ,15.211              ,0.757 
64   ,4160 ,127      ,4096   ,159.801        ,186.421             ,0.857 
64   ,4192 ,127      ,4096   ,159.891        ,181.758             ,0.88  
64   ,4224 ,127      ,4096   ,159.878        ,182.758             ,0.875 
64   ,4256 ,127      ,4096   ,161.458        ,184.396             ,0.876 
64   ,4288 ,127      ,4096   ,161.191        ,183.008             ,0.881 
64   ,432  ,127      ,16     ,7.766          ,9.107               ,0.853 
64   ,4320 ,127      ,4096   ,159.789        ,182.923             ,0.874 
64   ,4352 ,127      ,4096   ,165.247        ,185.383             ,0.891 
64   ,4384 ,127      ,4096   ,161.743        ,182.33              ,0.887 
64   ,4416 ,127      ,4096   ,160.01         ,183.496             ,0.872 
64   ,4448 ,127      ,4096   ,162.218        ,181.611             ,0.893 
64   ,448  ,127      ,1024   ,42.065         ,41.352              ,1.017 
64   ,448  ,127      ,128    ,15.186         ,18.339              ,0.828 
64   ,448  ,127      ,256    ,24.512         ,24.805              ,0.988 
64   ,448  ,127      ,32     ,8.277          ,8.787               ,0.942 
64   ,448  ,127      ,512    ,42.013         ,37.53               ,1.119 
64   ,448  ,127      ,64     ,11.596         ,14.871              ,0.78  
64   ,4480 ,127      ,4096   ,163.127        ,183.933             ,0.887 
64   ,4512 ,127      ,4096   ,164.937        ,186.469             ,0.885 
64   ,4544 ,127      ,4096   ,161.351        ,181.491             ,0.889 
64   ,4576 ,127      ,4096   ,162.731        ,177.275             ,0.918 
64   ,4608 ,127      ,4096   ,164.463        ,181.843             ,0.904 
64   ,464  ,127      ,16     ,8.047          ,9.629               ,0.836 
64   ,4640 ,127      ,4096   ,160.0          ,181.165             ,0.883 
64   ,4672 ,127      ,4096   ,161.484        ,185.762             ,0.869 
64   ,4704 ,127      ,4096   ,159.8          ,183.296             ,0.872 
64   ,4736 ,127      ,4096   ,161.754        ,188.718             ,0.857 
64   ,4768 ,127      ,4096   ,161.781        ,183.893             ,0.88  
64   ,48   ,127      ,16     ,8.071          ,8.96                ,0.901 
64   ,480  ,127      ,1024   ,43.12          ,41.388              ,1.042 
64   ,480  ,127      ,128    ,15.086         ,19.072              ,0.791 
64   ,480  ,127      ,256    ,24.499         ,24.671              ,0.993 
64   ,480  ,127      ,32     ,8.095          ,9.281               ,0.872 
64   ,480  ,127      ,512    ,43.09          ,39.439              ,1.093 
64   ,480  ,127      ,64     ,11.823         ,15.559              ,0.76  
64   ,4800 ,127      ,4096   ,159.781        ,188.592             ,0.847 
64   ,496  ,127      ,16     ,8.164          ,9.113               ,0.896 
64   ,512  ,127      ,1024   ,45.669         ,41.583              ,1.098 
64   ,512  ,127      ,1056   ,45.632         ,41.507              ,1.099 
64   ,512  ,127      ,1088   ,45.69          ,41.573              ,1.099 
64   ,512  ,127      ,1120   ,45.63          ,41.96               ,1.087 
64   ,512  ,127      ,1152   ,45.679         ,41.513              ,1.1   
64   ,512  ,127      ,1184   ,45.643         ,41.485              ,1.1   
64   ,512  ,127      ,1216   ,45.634         ,41.481              ,1.1   
64   ,512  ,127      ,128    ,16.158         ,19.405              ,0.833 
64   ,512  ,127      ,160    ,16.667         ,18.292              ,0.911 
64   ,512  ,127      ,192    ,19.931         ,22.015              ,0.905 
64   ,512  ,127      ,224    ,21.976         ,23.494              ,0.935 
64   ,512  ,127      ,256    ,24.613         ,24.359              ,1.01  
64   ,512  ,127      ,288    ,26.287         ,25.949              ,1.013 
64   ,512  ,127      ,32     ,8.278          ,9.032               ,0.916 
64   ,512  ,127      ,320    ,26.777         ,33.628              ,0.796 
64   ,512  ,127      ,352    ,31.718         ,32.567              ,0.974 
64   ,512  ,127      ,384    ,31.644         ,33.668              ,0.94  
64   ,512  ,127      ,416    ,31.653         ,34.469              ,0.918 
64   ,512  ,127      ,448    ,31.661         ,37.227              ,0.85  
64   ,512  ,127      ,480    ,45.612         ,37.516              ,1.216 
64   ,512  ,127      ,512    ,45.445         ,38.135              ,1.192 
64   ,512  ,127      ,544    ,45.647         ,41.604              ,1.097 
64   ,512  ,127      ,576    ,45.612         ,42.634              ,1.07  
64   ,512  ,127      ,608    ,47.646         ,41.531              ,1.147 
64   ,512  ,127      ,64     ,11.82          ,14.893              ,0.794 
64   ,512  ,127      ,640    ,45.647         ,41.497              ,1.1   
64   ,512  ,127      ,672    ,45.657         ,41.698              ,1.095 
64   ,512  ,127      ,704    ,45.635         ,41.689              ,1.095 
64   ,512  ,127      ,736    ,45.631         ,41.568              ,1.098 
64   ,512  ,127      ,768    ,45.623         ,41.785              ,1.092 
64   ,512  ,127      ,800    ,45.639         ,41.499              ,1.1   
64   ,512  ,127      ,832    ,45.747         ,41.524              ,1.102 
64   ,512  ,127      ,864    ,45.628         ,41.5                ,1.099 
64   ,512  ,127      ,896    ,45.754         ,41.626              ,1.099 
64   ,512  ,127      ,928    ,45.66          ,42.102              ,1.085 
64   ,512  ,127      ,96     ,12.466         ,16.513              ,0.755 
64   ,512  ,127      ,960    ,46.467         ,41.564              ,1.118 
64   ,512  ,127      ,992    ,45.639         ,41.484              ,1.1   
64   ,528  ,127      ,16     ,8.146          ,9.17                ,0.888 
64   ,544  ,127      ,1024   ,45.65          ,41.55               ,1.099 
64   ,544  ,127      ,128    ,16.205         ,17.925              ,0.904 
64   ,544  ,127      ,256    ,24.496         ,23.797              ,1.029 
64   ,544  ,127      ,32     ,7.986          ,9.463               ,0.844 
64   ,544  ,127      ,512    ,45.613         ,38.692              ,1.179 
64   ,544  ,127      ,64     ,11.702         ,15.498              ,0.755 
64   ,560  ,127      ,16     ,7.931          ,9.284               ,0.854 
64   ,576  ,127      ,1024   ,47.078         ,46.296              ,1.017 
64   ,576  ,127      ,128    ,15.685         ,19.175              ,0.818 
64   ,576  ,127      ,256    ,24.469         ,24.79               ,0.987 
64   ,576  ,127      ,32     ,7.98           ,9.112               ,0.876 
64   ,576  ,127      ,512    ,36.553         ,39.743              ,0.92  
64   ,576  ,127      ,64     ,11.704         ,15.692              ,0.746 
64   ,592  ,127      ,16     ,7.987          ,9.275               ,0.861 
64   ,608  ,127      ,1024   ,47.954         ,46.277              ,1.036 
64   ,608  ,127      ,128    ,15.56          ,19.322              ,0.805 
64   ,608  ,127      ,256    ,24.471         ,24.902              ,0.983 
64   ,608  ,127      ,32     ,8.128          ,8.786               ,0.925 
64   ,608  ,127      ,512    ,36.538         ,38.313              ,0.954 
64   ,608  ,127      ,64     ,11.823         ,15.789              ,0.749 
64   ,624  ,127      ,16     ,8.239          ,8.949               ,0.921 
64   ,64   ,127      ,128    ,14.211         ,18.094              ,0.785 
64   ,64   ,127      ,160    ,14.186         ,17.917              ,0.792 
64   ,64   ,127      ,192    ,14.198         ,13.693              ,1.037 
64   ,64   ,127      ,224    ,14.167         ,13.813              ,1.026 
64   ,64   ,127      ,256    ,14.197         ,14.137              ,1.004 
64   ,64   ,127      ,288    ,14.179         ,13.739              ,1.032 
64   ,64   ,127      ,32     ,8.413          ,9.124               ,0.922 
64   ,64   ,127      ,320    ,14.193         ,14.154              ,1.003 
64   ,64   ,127      ,352    ,14.178         ,13.659              ,1.038 
64   ,64   ,127      ,384    ,14.17          ,13.727              ,1.032 
64   ,64   ,127      ,416    ,14.198         ,13.955              ,1.017 
64   ,64   ,127      ,448    ,14.183         ,14.055              ,1.009 
64   ,64   ,127      ,480    ,14.159         ,14.482              ,0.978 
64   ,64   ,127      ,512    ,14.209         ,14.355              ,0.99  
64   ,64   ,127      ,544    ,14.185         ,13.543              ,1.047 
64   ,64   ,127      ,576    ,14.186         ,13.246              ,1.071 
64   ,64   ,127      ,608    ,14.178         ,13.644              ,1.039 
64   ,64   ,127      ,64     ,11.14          ,14.054              ,0.793 
64   ,64   ,127      ,640    ,14.179         ,13.408              ,1.058 
64   ,64   ,127      ,672    ,14.162         ,13.757              ,1.029 
64   ,64   ,127      ,704    ,14.185         ,13.754              ,1.031 
64   ,64   ,127      ,736    ,14.187         ,14.247              ,0.996 
64   ,64   ,127      ,768    ,14.186         ,13.961              ,1.016 
64   ,64   ,127      ,96     ,14.472         ,16.34               ,0.886 
64   ,640  ,127      ,1024   ,50.528         ,46.462              ,1.088 
64   ,640  ,127      ,128    ,15.77          ,19.27               ,0.818 
64   ,640  ,127      ,256    ,24.464         ,23.926              ,1.022 
64   ,640  ,127      ,32     ,8.127          ,9.275               ,0.876 
64   ,640  ,127      ,512    ,36.539         ,38.523              ,0.949 
64   ,640  ,127      ,64     ,11.48          ,15.885              ,0.723 
64   ,656  ,127      ,16     ,8.09           ,9.604               ,0.842 
64   ,672  ,127      ,1024   ,50.522         ,46.415              ,1.088 
64   ,672  ,127      ,128    ,15.433         ,19.327              ,0.799 
64   ,672  ,127      ,256    ,24.473         ,24.043              ,1.018 
64   ,672  ,127      ,32     ,8.022          ,9.275               ,0.865 
64   ,672  ,127      ,512    ,36.526         ,38.373              ,0.952 
64   ,672  ,127      ,64     ,11.504         ,16.131              ,0.713 
64   ,688  ,127      ,16     ,7.772          ,9.446               ,0.823 
64   ,704  ,127      ,1024   ,51.762         ,51.756              ,1.0   
64   ,704  ,127      ,128    ,14.994         ,19.119              ,0.784 
64   ,704  ,127      ,256    ,24.468         ,24.094              ,1.016 
64   ,704  ,127      ,32     ,7.879          ,9.06                ,0.87  
64   ,704  ,127      ,512    ,36.534         ,38.902              ,0.939 
64   ,704  ,127      ,64     ,11.593         ,15.055              ,0.77  
64   ,720  ,127      ,16     ,8.176          ,9.409               ,0.869 
64   ,736  ,127      ,1024   ,53.036         ,51.776              ,1.024 
64   ,736  ,127      ,128    ,15.694         ,18.752              ,0.837 
64   ,736  ,127      ,256    ,24.559         ,24.598              ,0.998 
64   ,736  ,127      ,32     ,8.245          ,9.112               ,0.905 
64   ,736  ,127      ,512    ,36.553         ,38.249              ,0.956 
64   ,736  ,127      ,64     ,11.474         ,15.868              ,0.723 
64   ,7488 ,127      ,8192   ,335.579        ,309.388             ,1.085 
64   ,7520 ,127      ,8192   ,333.901        ,310.714             ,1.075 
64   ,7552 ,127      ,8192   ,337.389        ,309.396             ,1.09  
64   ,7584 ,127      ,8192   ,332.885        ,309.217             ,1.077 
64   ,7616 ,127      ,8192   ,340.286        ,312.344             ,1.089 
64   ,7648 ,127      ,8192   ,334.582        ,313.115             ,1.069 
64   ,768  ,127      ,1024   ,55.575         ,52.117              ,1.066 
64   ,768  ,127      ,128    ,16.385         ,19.115              ,0.857 
64   ,768  ,127      ,256    ,24.476         ,24.578              ,0.996 
64   ,768  ,127      ,512    ,36.545         ,38.263              ,0.955 
64   ,768  ,127      ,64     ,11.371         ,16.618              ,0.684 
64   ,7680 ,127      ,8192   ,339.98         ,319.364             ,1.065 
64   ,7712 ,127      ,8192   ,340.797        ,315.618             ,1.08  
64   ,7744 ,127      ,8192   ,337.328        ,318.327             ,1.06  
64   ,7776 ,127      ,8192   ,339.25         ,318.181             ,1.066 
64   ,7808 ,127      ,8192   ,342.429        ,315.16              ,1.087 
64   ,7840 ,127      ,8192   ,343.402        ,315.784             ,1.087 
64   ,7872 ,127      ,8192   ,345.654        ,322.79              ,1.071 
64   ,7904 ,127      ,8192   ,343.039        ,325.493             ,1.054 
64   ,7936 ,127      ,8192   ,346.126        ,321.615             ,1.076 
64   ,7968 ,127      ,8192   ,345.973        ,324.619             ,1.066 
64   ,80   ,127      ,16     ,8.127          ,9.107               ,0.892 
64   ,800  ,127      ,1024   ,55.356         ,51.892              ,1.067 
64   ,800  ,127      ,128    ,15.682         ,18.887              ,0.83  
64   ,800  ,127      ,256    ,24.659         ,23.97               ,1.029 
64   ,800  ,127      ,512    ,36.56          ,38.423              ,0.951 
64   ,8000 ,127      ,8192   ,351.063        ,324.66              ,1.081 
64   ,8032 ,127      ,8192   ,348.884        ,328.445             ,1.062 
64   ,8064 ,127      ,8192   ,345.907        ,330.401             ,1.047 
64   ,8096 ,127      ,8192   ,347.606        ,330.126             ,1.053 
64   ,832  ,127      ,1024   ,58.47          ,57.029              ,1.025 
64   ,832  ,127      ,128    ,15.717         ,18.825              ,0.835 
64   ,832  ,127      ,256    ,25.07          ,24.481              ,1.024 
64   ,832  ,127      ,512    ,36.517         ,38.58               ,0.947 
64   ,864  ,127      ,1024   ,58.007         ,56.577              ,1.025 
64   ,864  ,127      ,256    ,24.5           ,24.807              ,0.988 
64   ,864  ,127      ,512    ,36.539         ,38.751              ,0.943 
64   ,896  ,127      ,1024   ,60.306         ,57.951              ,1.041 
64   ,896  ,127      ,256    ,24.476         ,24.648              ,0.993 
64   ,896  ,127      ,512    ,36.547         ,38.437              ,0.951 
64   ,928  ,127      ,1024   ,60.318         ,56.638              ,1.065 
64   ,928  ,127      ,256    ,24.563         ,25.382              ,0.968 
64   ,928  ,127      ,512    ,36.51          ,39.339              ,0.928 
64   ,96   ,127      ,128    ,16.791         ,17.59               ,0.955 
64   ,96   ,127      ,256    ,15.692         ,15.259              ,1.028 
64   ,96   ,127      ,32     ,7.898          ,8.966               ,0.881 
64   ,96   ,127      ,512    ,15.754         ,15.908              ,0.99  
64   ,96   ,127      ,64     ,11.94          ,14.555              ,0.82  
64   ,960  ,127      ,1024   ,62.045         ,57.457              ,1.08  
64   ,960  ,127      ,256    ,24.646         ,24.87               ,0.991 
64   ,960  ,127      ,512    ,36.512         ,38.429              ,0.95  
64   ,992  ,127      ,1024   ,62.779         ,59.056              ,1.063 
64   ,992  ,127      ,512    ,36.502         ,38.5                ,0.948 
7    ,5000 ,127      ,7      ,7.766          ,9.279               ,0.837 
7    ,7    ,127      ,5000   ,8.023          ,7.318               ,1.096 
7    ,7    ,127      ,6      ,8.017          ,8.829               ,0.908 
7    ,7    ,127      ,7      ,7.873          ,8.958               ,0.879 
7    ,7    ,127      ,8      ,7.992          ,8.781               ,0.91  
0.9468079980272118
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 2109ec2f7a..487846f098 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@ 
 #  define STRLEN	__strlen_evex
 # endif
 
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
-#  define SHIFT_REG ecx
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
-#  define SHIFT_REG edx
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-	mov	%RSI_LP, %R8_LP
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check.  */
+	vpxorq	%XZERO, %XZERO, %XZERO
 	andl	$(PAGE_SIZE - 1), %eax
-	/* Check if we may cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-# ifdef USE_AS_STRNLEN
-	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$CHAR_PER_VEC, %rsi
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-	ret
-# ifdef USE_AS_STRNLEN
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Set bit for max len so that tzcnt will return min of max len
-	   and position of first match.  */
-	btsq	%rsi, %rax
-	tzcntl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+	bsf	%VRAX, %VRAX
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x4):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
 	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
 	ret
 
-	.p2align 5
+
+
+	/* Aligned more: the first VEC had no null CHAR.  Align rdi
+	   down to VEC_SIZE (rcx keeps the original pointer) and check
+	   the next 4x VEC one at a time before going to the loop.  */
+	.p2align 4,, 10
 L(aligned_more):
-	movq	%rdi, %rdx
-	/* Align data to VEC_SIZE.  */
-	andq	$-(VEC_SIZE), %rdi
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
 L(cross_page_continue):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-# ifdef USE_AS_STRNLEN
-	/* + CHAR_SIZE because it simplies the logic in
-	   last_4x_vec_or_less.  */
-	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
-	subq	%rdx, %rcx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
-	/* Adjust length. If near end handle specially.  */
-	subq	%rcx, %rsi
-	jb	L(last_4x_vec_or_less)
-# endif
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Check the next 4x VEC one VEC at a time since rdi is only
+	   aligned to VEC_SIZE.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	test	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x4)
 
-	addq	$VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
-	/* Check if at last VEC_SIZE * 4 length.  */
-	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
-	jbe	L(last_4x_vec_or_less_load)
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-	/* Readjust length.  */
-	addq	%rcx, %rsi
-# endif
-	/* Align data to VEC_SIZE * 4.  */
+	subq	$(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on processors that we use evex512
+	   on and this saves code size.  */
+	xorb	%dil, %dil
+# else
 	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+
 
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Load first VEC regardless.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rsi
-	jb	L(last_4x_vec_or_less_cmpeq)
-# endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
-	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	VPCMP	$0, %YMM4, %YMMZERO, %k1
 	subq	$-(VEC_SIZE * 4), %rdi
-	kortestd	%k0, %k1
+	KORTEST %k0, %k2
 	jz	L(loop_4x_vec)
 
-	/* Check if end was in first half.  */
-	kmovd	%k0, %eax
-	subq	%rdx, %rdi
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rdi
-# endif
-	testl	%eax, %eax
-	jz	L(second_vec_return)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
 
-	VPCMP	$0, %YMM1, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
-# ifdef USE_AS_WCSLEN
-	sall	$CHAR_PER_VEC, %eax
-	orl	%edx, %eax
-	tzcntl	%eax, %eax
-# else
-	salq	$CHAR_PER_VEC, %rax
-	orq	%rdx, %rax
-	tzcntq	%rax, %rax
-# endif
-	addq	%rdi, %rax
-	ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
-	addq	$(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
-	kmovd	%k0, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
-	testl	$(CHAR_PER_VEC * 2), %esi
-	jnz	L(last_4x_vec)
-
-	/* length may have been negative or positive by an offset of
-	   CHAR_PER_VEC * 4 depending on where this was called from. This
-	   fixes that.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-	/* Check the end of data.  */
-	subl	$CHAR_PER_VEC, %esi
-	jb	L(max)
+	VPTESTN	%VMM(3), %VMM(3), %k0
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-L(max):
-	movq	%r8, %rax
-	ret
-# endif
-
-	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
-	   in the 4x VEC loop can use 2 byte encoding.  */
-	.p2align 4
-L(second_vec_return):
-	VPCMP	$0, %YMM3, %YMMZERO, %k0
-	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
-# ifdef USE_AS_WCSLEN
-	kunpckbw	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	kunpckdq	%k0, %k1, %k0
-	kmovq	%k0, %rax
-	tzcntq	%rax, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
 # endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
 
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec):
-	/* Test first 2x VEC normally.  */
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	/* Normalize length.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	/* Check the end of data.  */
-	subl	$(CHAR_PER_VEC * 3), %esi
-	jb	L(max)
-
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	subl	$(CHAR_PER_VEC * 2), %esi
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
-	ret
-L(max_end):
-	movq	%r8, %rax
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+	 */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
-# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+	movq	%rdi, %rcx
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	/* Remove the leading bytes.  */
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	movl	%edx, %ecx
-	shrl	$2, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-# endif
-	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
-	sarxl	%SHIFT_REG, %eax, %eax
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
 	testl	%eax, %eax
-# ifndef USE_AS_STRNLEN
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	ret
 # else
-	jnz	L(cross_page_less_vec)
-#  ifndef USE_AS_WCSLEN
-	movl	%edx, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-#  endif
-	movl	$CHAR_PER_VEC, %eax
-	subl	%ecx, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	ja	L(cross_page_continue)
-	movl	%esi, %eax
-	ret
-L(cross_page_less_vec):
-	tzcntl	%eax, %eax
-	/* Select min of length and position of first null.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
+	ret
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 64a9fc2606..443a32a749 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@ 
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+#  define STRNLEN	__strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+
+#  define REG_WIDTH	VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+# else
+#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+	movq	%rsi, %rax
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	bsfq	%rcx, %rax
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+
+# if CHAR_PER_VEC != 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	/* Aligned more for strnlen compares remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+# endif
+
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	/* Check the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jz	L(max_0)
+	/* Best place for LAST_VEC_CHECK if ZMM.  */
+	.p2align 4,, 8
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+# if CHAR_PER_VEC == 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	.p2align 4,, 8
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	.p2align 4,, 6
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
 
-#define USE_AS_STRNLEN 1
-#define STRLEN	STRNLEN
+	KMOV	%k0, %VRDX
 
-#include "strlen-evex.S"
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subl	$(CHAR_PER_VEC), %eax
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+	.p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+
+	.p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+
+	.p2align 4,, 5
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+# else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+# else
+	subq	%rdi, %rax
+# endif
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jbe	L(loop_len_end)
+
+
+	KORTEST %k0, %k2
+	jz	L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+	movq	%rsi, %rcx
+	subq	%rax, %rsi
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x0)
+
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k2, %VRDX
+# else
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rax, %rdx
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	bsfq	%rdx, %rdx
+	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
+	.p2align 4,, 4
+L(loop_len_end):
+	KORTEST %k0, %k2
+	jnz	L(loop_last_4x_vec)
+	movq	%rsi, %rax
+	ret
+
+
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64
+	   need return label for it.  */
+	.p2align 4,, 8
+L(last_vec_x2):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(last_vec_x1):
+	addq	$CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+
+	.p2align 4,, 8
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+# endif
+	shrx	%VRAX, %VRCX, %VRCX
+
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
index e2aad94c1e..57a7e93fbf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@ 
 # define WCSNLEN	__wcsnlen_evex
 #endif
 
-#define STRLEN	WCSNLEN
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex.S"
+#include "strnlen-evex.S"