
[v1,1/7] x86: Optimize memchr-evex.S and implement with VMM headers

Message ID 20221018024901.3381469-1-goldstein.w.n@gmail.com
State New
Series [v1,1/7] x86: Optimize memchr-evex.S and implement with VMM headers

Commit Message

Noah Goldstein Oct. 18, 2022, 2:48 a.m. UTC
Optimizations are:

1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
   in the short-string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially when doing so doesn't save
   fetch blocks or causes the basic block to span extra cache lines.
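
As a rough illustration of point 1, here is a minimal C sketch (not part
of the patch) of how tzcnt(0) == VEC_SIZE lets a single length comparison
cover both "no match and the string fits in one vector" and "match found
but past the end".  It assumes VEC_SIZE == 32 and BMI tzcnt; the helper
and parameter names are made up for the example:

#include <immintrin.h>
#include <stddef.h>

/* Sketch only: _tzcnt_u32 (0) == 32 (i.e. VEC_SIZE), so the single
   `len <= idx' test rejects both "no match and the whole string fits in
   one vector" and "match found but beyond len".  */
static const char *
first_vec_result (const char *s, unsigned int match_mask, size_t len,
                  int *need_more_vecs)
{
  unsigned int idx = _tzcnt_u32 (match_mask);   /* 32 when mask == 0.  */
  *need_more_vecs = 0;
  if (len <= idx)
    return NULL;                /* Short string or out-of-bounds match.  */
  if (match_mask == 0)
    {
      /* No match yet but len > VEC_SIZE: the assembly branches to the
         longer-string path here.  */
      *need_more_vecs = 1;
      return NULL;
    }
  return s + idx;               /* In-bounds match.  */
}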

The optimizations (especially for point 2) make the memchr and
rawmemchr code essentially incompatible, so rawmemchr-evex is split
out into a new file.

Code Size Changes:
memchr-evex.S       : -107 bytes
rawmemchr-evex.S    :  -53 bytes

Net perf changes:

Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
is an improvement and > 1.0 is a regression.

memchr-evex.S       : 0.928
rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)
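
The per-function numbers above are geometric means of the per-benchmark
New Time / Old Time ratios.  A minimal sketch of that aggregation (not
the benchtest harness itself, just the math):

#include <math.h>
#include <stddef.h>

/* Geometric mean of n ratios (new_time / old_time): exp of the mean of
   the logs.  A result < 1.0 means the new version is faster overall.  */
static double
geomean (const double *ratios, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  return exp (log_sum / (double) n);
}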

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
 3 files changed, 851 insertions(+), 410 deletions(-)

Comments

Noah Goldstein Oct. 18, 2022, 2:50 a.m. UTC | #1
On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>
> 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
>    in short string case.
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general makes
>       sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the memchr and
> rawmemchr code essentially incompatible so split rawmemchr-evex
> to a new file.
>
> Code Size Changes:
> memchr-evex.S       : -107 bytes
> rawmemchr-evex.S    :  -53 bytes
>
> Net perf changes:
>
> Reported as the geometric mean of all improvements / regressions from
> N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
> is an improvement and > 1.0 is a regression.
>
> memchr-evex.S       : 0.928
> rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
>  sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
>  sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
>  3 files changed, 851 insertions(+), 410 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 0dd4f1dcce..23a1c0018e 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -21,17 +21,27 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
>  # ifndef MEMCHR
>  #  define MEMCHR       __memchr_evex
>  # endif
>
>  # ifdef USE_AS_WMEMCHR
> +#  define PC_SHIFT_GPR rcx
> +#  define VPTESTN      vptestnmd
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPMINU       vpminud
>  #  define VPCMP        vpcmpd
>  #  define VPCMPEQ      vpcmpeqd
>  #  define CHAR_SIZE    4
> +
> +#  define USE_WIDE_CHAR
>  # else
> +#  define PC_SHIFT_GPR rdi
> +#  define VPTESTN      vptestnmb
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPMINU       vpminub
>  #  define VPCMP        vpcmpb
> @@ -39,534 +49,661 @@
>  #  define CHAR_SIZE    1
>  # endif
>
> -       /* In the 4x loop the RTM and non-RTM versions have data pointer
> -          off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
> -          This is represented by BASE_OFFSET. As well because the RTM
> -          version uses vpcmp which stores a bit per element compared where
> -          the non-RTM version uses vpcmpeq which stores a bit per byte
> -          compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
> -          version.  */
> -# ifdef USE_IN_RTM
> +# include "reg-macros.h"
> +
> +
> +/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
> +   encoding), use VEX encoding in the loop so we
> +   can use vpcmpeqb + vptern which is more efficient than the
> +   EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
>  #  define VZEROUPPER
> -#  define BASE_OFFSET  (VEC_SIZE * 4)
> -#  define RET_SCALE    CHAR_SIZE
> +
> +#  define USE_TERN_IN_LOOP     0
>  # else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
>  #  define VZEROUPPER   vzeroupper
> -#  define BASE_OFFSET  0
> -#  define RET_SCALE    1
>  # endif
>
> -       /* In the return from 4x loop memchr and rawmemchr versions have
> -          data pointers off by VEC_SIZE * 4 with memchr version being
> -          VEC_SIZE * 4 greater.  */
> -# ifdef USE_AS_RAWMEMCHR
> -#  define RET_OFFSET   (BASE_OFFSET - (VEC_SIZE * 4))
> -#  define RAW_PTR_REG  rcx
> -#  define ALGN_PTR_REG rdi
> +# if USE_TERN_IN_LOOP
> +       /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
> +          so we don't want to multiply resulting index.  */
> +#  define TERN_CHAR_MULT       1
> +
> +#  ifdef USE_AS_WMEMCHR
> +#   define TEST_END()  inc %VRCX
> +#  else
> +#   define TEST_END()  add %rdx, %rcx
> +#  endif
>  # else
> -#  define RET_OFFSET   BASE_OFFSET
> -#  define RAW_PTR_REG  rdi
> -#  define ALGN_PTR_REG rcx
> +#  define TERN_CHAR_MULT       CHAR_SIZE
> +#  define TEST_END()   KORTEST %k2, %k3
>  # endif
>
> -# define XMMZERO       xmm23
> -# define YMMZERO       ymm23
> -# define XMMMATCH      xmm16
> -# define YMMMATCH      ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +#  ifndef USE_AS_WMEMCHR
> +#   define GPR_X0_IS_RET       1
> +#  else
> +#   define GPR_X0_IS_RET       0
> +#  endif
> +#  define GPR_X0       rax
> +# else
> +#  define GPR_X0_IS_RET        0
> +#  define GPR_X0       rdx
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -# ifndef SECTION
> -#  define SECTION(p)   p##.evex
> +# if CHAR_PER_VEC == 64
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
> +# else
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
> +# endif
> +# if CHAR_PER_VEC >= 32
> +#  define MASK_GPR(...)        VGPR(__VA_ARGS__)
> +# elif CHAR_PER_VEC == 16
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 16)
> +# else
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 8)
>  # endif
>
> -# define VEC_SIZE 32
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -# define PAGE_SIZE 4096
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
>
> -       .section SECTION(.text),"ax",@progbits
> +# define PAGE_SIZE     4096
> +
> +
> +       .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN (MEMCHR, 6)
> -# ifndef USE_AS_RAWMEMCHR
>         /* Check for zero length.  */
>         test    %RDX_LP, %RDX_LP
> -       jz      L(zero)
> +       jz      L(zero_0)
>
> -#  ifdef __ILP32__
> +# ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> -#  endif
>  # endif
> -       /* Broadcast CHAR to YMMMATCH.  */
> -       VPBROADCAST %esi, %YMMMATCH
> +       VPBROADCAST %esi, %VMATCH
>         /* Check if we may cross page boundary with one vector load.  */
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page_boundary)
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +# ifndef USE_AS_WMEMCHR
> +       /* If rcx is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
> +          already a dependency between rcx and rsi so no worries about
> +          false-dep here.  */
> +       tzcnt   %VRAX, %VRSI
> +       /* If rdx <= rsi then either 1) rcx was non-zero (there was a
> +          match) but it was out of bounds or 2) rcx was zero and rdx
> +          was <= VEC_SIZE so we are done scanning.  */
> +       cmpq    %rsi, %rdx
> +       /* NB: Use branch to return zero/non-zero.  Common usage will
> +          branch on result of function (if return is null/non-null).
> +          This branch can be used to predict the ensuing one so there
> +          is no reason to extend the data-dependency with cmovcc.  */
> +       jbe     L(zero_0)
> +
> +       /* If rcx is zero then len must be > RDX, otherwise since we
> +          already tested len vs lzcnt(rcx) (in rsi) we are good to
> +          return this match.  */
> +       test    %VRAX, %VRAX
> +       jz      L(more_1x_vec)
> +       leaq    (%rdi, %rsi), %rax
> +# else
>
> -       /* Check the first VEC_SIZE bytes.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* If length < CHAR_PER_VEC handle special.  */
> +       /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
> +          > 1, so if rcx is zero then tzcnt != CHAR_PER_VEC.  */
>         cmpq    $CHAR_PER_VEC, %rdx
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       ja      L(more_1x_vec)
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(zero_0)
> +L(first_vec_x0_ret):
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
> -# else
> -       addq    %rdi, %rax
>  # endif
>         ret
>
> -# ifndef USE_AS_RAWMEMCHR
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Check if first match was before length. NB: tzcnt has false data-
> -          dependency on destination. eax already had a data-dependency on esi
> -          so this should have no affect here.  */
> -       tzcntl  %eax, %esi
> -#  ifdef USE_AS_WMEMCHR
> -       leaq    (%rdi, %rsi, CHAR_SIZE), %rdi
> -#  else
> -       addq    %rsi, %rdi
> -#  endif
> +       /* Only fits in first cache line for VEC_SIZE == 32.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 2
> +L(zero_0):
>         xorl    %eax, %eax
> -       cmpl    %esi, %edx
> -       cmovg   %rdi, %rax
>         ret
>  # endif
>
> -       .p2align 4
> -L(cross_page_boundary):
> -       /* Save pointer before aligning as its original value is
> -          necessary for computer return address if byte is found or
> -          adjusting length if it is not and this is memchr.  */
> -       movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> -          for rawmemchr.  */
> -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> -       VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> -       kmovd   %k0, %r8d
> +       .p2align 4,, 9
> +L(more_1x_vec):
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       sarl    $2, %eax
> -# endif
> -# ifndef USE_AS_RAWMEMCHR
> -       movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> -       subl    %eax, %esi
> +       /* For wmemchr we still need to test if there was a match in
> +          the first VEC.  Use bsf to test here so we can reuse
> +          L(first_vec_x0_ret).  */
> +       bsf     %VRAX, %VRAX
> +       jnz     L(first_vec_x0_ret)
>  # endif
> +
> +L(page_cross_continue):
>  # ifdef USE_AS_WMEMCHR
> -       andl    $(CHAR_PER_VEC - 1), %eax
> -# endif
> -       /* Remove the leading bytes.  */
> -       sarxl   %eax, %r8d, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check the end of data.  */
> -       cmpq    %rsi, %rdx
> -       jbe     L(first_vec_x0)
> +       /* We can't use end of the buffer to re-calculate length for
> +          wmemchr as len * CHAR_SIZE may overflow.  */
> +       leaq    -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       addq    %rdx, %rax
> +# else
> +       leaq    -(VEC_SIZE + 1)(%rdx, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
>  # endif
> -       testl   %eax, %eax
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> +
> +       /* rax contains remaining length - 1.  -1 so we can get imm8
> +          encoding in a few additional places saving code size.  */
> +
> +       /* Needed regardless of remaining length.  */
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +
> +       /* We cannot fold the above `sub %rdi, %rax` with the `cmp
> +          $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
> +          large length to overflow and cause the subtract to carry
> +          despite length being above CHAR_PER_VEC * 2.  */
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rax
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1_check)
> +
> +       /* Check the end of data.  NB: use 8-bit operations to save code
> +          size.  We no longer need the full-width of eax and will
> +          perform a write-only operation over eax so there will be no
> +          partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 1 - 1), %al
> +       jle     L(zero_0)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +       /* For wmemchr we can't take advantage of tzcnt(0) ==
> +          VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_0)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +
> +       /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
> +          fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
> +          not enough space before the next cache line to fit the `lea`
> +          for return.  */
> +# if VEC_SIZE == 64
> +       ja      L(first_vec_x2_ret)
> +L(zero_0):
> +       xorl    %eax, %eax
> +       ret
>  # else
> -       addq    %RAW_PTR_REG, %rax
> +       jbe     L(zero_0)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
>  # endif
> +
> +       .p2align 4,, 5
> +L(first_vec_x1_check):
> +       bsf     %VRDX, %VRDX
> +       cmpb    %dl, %al
> +       jb      L(zero_4)
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 32.
> +        */
> +# if VEC_SIZE == 32
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> +
> +       .p2align 4,, 4
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       bsf     %VRCX, %VRCX
> +L(first_vec_x2_ret):
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +        */
> +# if VEC_SIZE == 64
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> -L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 4
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 5
> -L(aligned_more):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Align data to VEC_SIZE.  */
> -L(cross_page_continue):
> -       xorl    %ecx, %ecx
> -       subl    %edi, %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       /* esi is for adjusting length to see if near the end.  */
> -       leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> -#  ifdef USE_AS_WMEMCHR
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %esi
> -#  endif
> -# else
> -       andq    $-VEC_SIZE, %rdi
> -L(cross_page_continue):
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rsi, %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -       testl   %eax, %eax
> +       .p2align 4,, 5
> +L(more_2x_vec):
> +       /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
> +          length.  */
> +
> +
> +       /* Already computed matches for first VEC in rdx.  */
> +       test    %VRDX, %VRDX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Needed regardless of next length check.  */
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* Check if we are near the end.  */
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rax
> +       ja      L(more_4x_vec)
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       /* Use 8-bit instructions to save code size.  We won't use full-
> +          width eax again and will perform a write-only operation to
> +          eax so no worries about partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jb      L(zero_2)
> +L(last_vec_check):
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WMEMCHR
> +       /* For wmemchr we can't take advantage of tzcnt(0) ==
> +          VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_2)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +       jae     L(first_vec_x4_ret)
> +L(zero_2):
> +       xorl    %eax, %eax
> +       ret
> +
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +          For VEC_SIZE == 32 we put the return label at the end of
> +          L(first_vec_x4).  */
> +# if VEC_SIZE == 64
> +L(first_vec_x4_ret):
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +       bsf     %VRCX, %VRCX
> +# if VEC_SIZE == 32
> +       /* Place L(first_vec_x4_ret) here as we can't fit it in the same
> +          cache line as where it is called from so we might as well
> +          save code size by reusing return of L(first_vec_x4).  */
> +L(first_vec_x4_ret):
> +# endif
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3_check):
> +       /* Need to adjust remaining length before checking.  */
> +       addb    $-(CHAR_PER_VEC * 2), %al
> +       bsf     %VRCX, %VRCX
> +       cmpb    %cl, %al
> +       jb      L(zero_2)
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 3
> +# if !USE_TERN_IN_LOOP
> +       .p2align 4,, 10
> +# endif
> +L(more_4x_vec):
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x4)
>
> +       subq    $-(VEC_SIZE * 5), %rdi
> +       subq    $(CHAR_PER_VEC * 8), %rax
> +       jb      L(last_4x_vec)
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check if at last CHAR_PER_VEC * 4 length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       jbe     L(last_4x_vec_or_less_cmpeq)
> -       /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -
> -       /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> -        */
> -#  ifdef USE_AS_WMEMCHR
> +# ifdef USE_AS_WMEMCHR
>         movl    %edi, %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> +# else
> +       addq    %rdi, %rax
> +# endif
> +
> +
> +# if VEC_SIZE == 64
> +       /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex
> +          processor has partial register stalls (all have merging
> +          uop). If that changes this can be removed.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# ifdef USE_AS_WMEMCHR
>         subl    %edi, %ecx
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>         sarl    $2, %ecx
> -       addq    %rcx, %rdx
> -#  else
> -       addq    %rdi, %rdx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -       subq    %rdi, %rdx
> -#  endif
> +       addq    %rcx, %rax
>  # else
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -       andq    $-(4 * VEC_SIZE), %rdi
> +       subq    %rdi, %rax
>  # endif
> -# ifdef USE_IN_RTM
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -# else
> -       /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
> -          encodable with EVEX registers (ymm16-ymm31).  */
> -       vmovdqa64 %YMMMATCH, %ymm0
> +
> +
> +
> +# if USE_TERN_IN_LOOP
> +       /* copy VMATCH to low ymm so we can use vpcmpeq which is not
> +          encodable with EVEX registers.  NB: this is VEC_SIZE == 32
> +          only as there is no way to encode vpcmpeq with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
>  # endif
>
> -       /* Compare 4 * VEC at a time forward.  */
> -       .p2align 4
> +       .p2align 4,, 11
>  L(loop_4x_vec):
> -       /* Two versions of the loop. One that does not require
> -          vzeroupper by not using ymm0-ymm15 and another does that require
> -          vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
> -          is used at all is because there is no EVEX encoding vpcmpeq and
> -          with vpcmpeq this loop can be performed more efficiently. The
> -          non-vzeroupper version is safe for RTM while the vzeroupper
> -          version should be prefered if RTM are not supported.  */
> -# ifdef USE_IN_RTM
> -       /* It would be possible to save some instructions using 4x VPCMP
> -          but bottleneck on port 5 makes it not woth it.  */
> -       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> -       /* xor will set bytes match esi to zero.  */
> -       vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> -       vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> -       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> -       /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> -       VPMINU  %YMM2, %YMM3, %YMM3{%k1}{z}
> -       VPCMP   $0, %YMM3, %YMMZERO, %k2
> -# else
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does
> +          require vzeroupper because it uses ymm0-15.  The reason why
> +          ymm0-15 is used at all is because there is no EVEX encoding
> +          vpcmpeq and with vpcmpeq this loop can be performed more
> +          efficiently.  The non-vzeroupper version is safe for RTM
> +          while the vzeroupper version should be preferred if RTM is
> +          not supported.  Which loop version we use is determined by
> +          USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
>         /* Since vptern can only take 3x vectors fastest to do 1 vec
>            seperately with EVEX vpcmp.  */
>  #  ifdef USE_AS_WMEMCHR
>         /* vptern can only accept masks for epi32/epi64 so can only save
> -          instruction using not equals mask on vptern with wmemchr.  */
> -       VPCMP   $4, (%rdi), %YMMMATCH, %k1
> +          instruction using not equals mask on vptern with wmemchr.
> +        */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  else
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  endif
>         /* Compare 3x with vpcmpeq and or them all together with vptern.
>          */
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
>  #  ifdef USE_AS_WMEMCHR
> -       /* This takes the not of or between ymm2, ymm3, ymm4 as well as
> -          combines result from VEC0 with zero mask.  */
> -       vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
> -       vpmovmskb %ymm4, %ecx
> +       /* This takes the not of or between VEC_lo(2), VEC_lo(3),
> +          VEC_lo(4) as well as combines result from VEC(0) with zero
> +          mask.  */
> +       vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
> +       vpmovmskb %VMM_lo(4), %VRCX
>  #  else
> -       /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
> -       vpternlogd $254, %ymm2, %ymm3, %ymm4
> -       vpmovmskb %ymm4, %ecx
> -       kmovd   %k1, %eax
> +       /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
> +          VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +       KMOV    %k1, %edx
>  #  endif
> -# endif
>
> -# ifdef USE_AS_RAWMEMCHR
> -       subq    $-(VEC_SIZE * 4), %rdi
> -# endif
> -# ifdef USE_IN_RTM
> -       kortestd %k2, %k3
>  # else
> -#  ifdef USE_AS_WMEMCHR
> -       /* ecx contains not of matches. All 1s means no matches. incl will
> -          overflow and set zeroflag if that is the case.  */
> -       incl    %ecx
> -#  else
> -       /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
> -          to ecx is not an issue because if eax is non-zero it will be
> -          used for returning the match. If it is zero the add does
> -          nothing.  */
> -       addq    %rax, %rcx
> -#  endif
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
>  # endif
> -# ifdef USE_AS_RAWMEMCHR
> -       jz      L(loop_4x_vec)
> -# else
> -       jnz     L(loop_4x_vec_end)
> +
> +
> +       TEST_END ()
> +       jnz     L(loop_vec_ret)
>
>         subq    $-(VEC_SIZE * 4), %rdi
>
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       ja      L(loop_4x_vec)
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jae     L(loop_4x_vec)
>
> -       /* Fall through into less than 4 remaining vectors of length case.
> +       /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
>          */
> -       VPCMP   $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
> -       addq    $(BASE_OFFSET - VEC_SIZE), %rdi
> -       kmovd   %k0, %eax
> -       VZEROUPPER
> -
> -L(last_4x_vec_or_less):
> -       /* Check if first VEC contained match.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       COND_VZEROUPPER
>
> -       /* If remaining length > CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jg      L(last_4x_vec)
> -
> -L(last_2x_vec):
> -       /* If remaining length < CHAR_PER_VEC.  */
> -       addl    $CHAR_PER_VEC, %edx
> -       jle     L(zero_end)
> -
> -       /* Check VEC2 and compare any match with remaining length.  */
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end):
> -       ret
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
> +          instructions on eax from here on out.  */
> +# if CHAR_PER_VEC != 64
> +       andl    $(CHAR_PER_VEC * 4 - 1), %eax
> +# endif
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
> +       subq    $(VEC_SIZE * 1), %rdi
> +       KMOV    %k0, %VRDX
> +       cmpb    $(CHAR_PER_VEC * 2 - 1), %al
> +       jbe     L(last_2x_vec)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jae     L(last_vec_check)
>
> -L(set_zero_end):
>         xorl    %eax, %eax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1_check):
> -       /* eax must be non-zero. Use bsfl to save code size.  */
> -       bsfl    %eax, %eax
> -       /* Adjust length.  */
> -       subl    $-(CHAR_PER_VEC * 4), %edx
> -       /* Check if match within remaining length.  */
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +       addq    $VEC_SIZE, %rdi
> +L(last_vec_x1_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
> +# endif
>
> -       .p2align 4
> -L(loop_4x_vec_end):
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
> +          64 it needs a separate return label.  */
> +       .p2align 4,, 4
> +L(last_vec_x2):
> +L(last_vec_x2_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
> +       ret
>  # endif
> -       /* rawmemchr will fall through into this if match was found in
> -          loop.  */
>
> -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
> -       /* k1 has not of matches with VEC1.  */
> -       kmovd   %k1, %eax
> -#  ifdef USE_AS_WMEMCHR
> -       subl    $((1 << CHAR_PER_VEC) - 1), %eax
> -#  else
> -       incl    %eax
> -#  endif
> +       .p2align 4,, 4
> +L(loop_vec_ret):
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +       KMOV    %k1, %VRAX
> +       inc     %MASK_GPR(rax)
>  # else
> -       /* eax already has matches for VEC1.  */
> -       testl   %eax, %eax
> +       test    %VRDX, %VRDX
>  # endif
> -       jnz     L(last_vec_x1_return)
> +       jnz     L(last_vec_x0)
>
> -# ifdef USE_IN_RTM
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %eax
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRDX
>  # else
> -       vpmovmskb %ymm2, %eax
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRDX
>  # endif
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2_return)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
>
> -# ifdef USE_IN_RTM
> -       kmovd   %k2, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3_return)
>
> -       kmovd   %k3, %eax
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRDX
>  # else
> -       vpmovmskb %ymm3, %eax
> -       /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
> -       salq    $VEC_SIZE, %rcx
> -       orq     %rcx, %rax
> -       tzcntq  %rax, %rax
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
> -       VZEROUPPER
> +       KMOV    %k2, %VRDX
>  # endif
> -       ret
>
> -       .p2align 4,, 10
> -L(last_vec_x1_return):
> -       tzcntl  %eax, %eax
> -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if used VEX encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
> +          CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
> +          CHAR_PER_VEC <= 32 we can combine the results from the 2x
> +          VEC in a single GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k3, %VRDX
>  # else
> -       addq    %rdi, %rax
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $(VEC_SIZE / TERN_CHAR_MULT), %rcx
> +       addq    %rcx, %rdx
> +#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +#  endif
>  # endif
> -       VZEROUPPER
> +       bsf     %rdx, %rdx
> +       leaq    (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
> -          if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
> -          USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
> -       leaq    (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
> -       VZEROUPPER
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x1_novzero):
> +# endif
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -# ifdef USE_IN_RTM
> -       .p2align 4
> -L(last_vec_x3_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +
> +       .p2align 4,, 4
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VGPR(GPR_X0), %VGPR(GPR_X0)
> +# if GPR_X0_IS_RET
> +       addq    %rdi, %rax
> +# else
> +       leaq    (%rdi, %GPR_X0, CHAR_SIZE), %rax
> +# endif
>         ret
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* Need to preserve eax to compute inbound bytes we are
> +          checking.  */
> +# ifdef USE_AS_WMEMCHR
> +       movl    %eax, %ecx
> +# else
> +       xorl    %ecx, %ecx
> +       subl    %eax, %ecx
>  # endif
>
> -# ifndef USE_AS_RAWMEMCHR
> -       .p2align 4,, 5
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       /* Check first VEC regardless.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
>
> -       /* If remaining length <= CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jle     L(last_2x_vec)
> +# ifdef USE_AS_WMEMCHR
> +       /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes.  */
> +       shrl    $2, %ecx
> +       andl    $(CHAR_PER_VEC - 1), %ecx
> +# endif
>
> -       .p2align 4
> -L(last_4x_vec):
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
>
> +       shrx    %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Create mask for possible matches within remaining length.  */
> -#  ifdef USE_AS_WMEMCHR
> -       movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> -       bzhil   %edx, %ecx, %ecx
> -#  else
> -       movq    $-1, %rcx
> -       bzhiq   %rdx, %rcx, %rcx
> -#  endif
> -       /* Test matches in data against length match.  */
> -       andl    %ecx, %eax
> -       jnz     L(last_vec_x3)
> +# ifdef USE_AS_WMEMCHR
> +       negl    %ecx
> +# endif
>
> -       /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
> -          remaining length was found to be > CHAR_PER_VEC * 2.  */
> -       subl    $CHAR_PER_VEC, %edx
> -       jbe     L(zero_end2)
> +       /* mask lower bits from ecx (negative eax) to get bytes till
> +          next VEC.  */
> +       andl    $(CHAR_PER_VEC - 1), %ecx
>
> +       /* Check if VEC is entirely contained in the remainder of the
> +          page.  */
> +       cmpq    %rcx, %rdx
> +       jbe     L(page_cross_ret)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Shift remaining length mask for last VEC.  */
> -#  ifdef USE_AS_WMEMCHR
> -       shrl    $CHAR_PER_VEC, %ecx
> -#  else
> -       shrq    $CHAR_PER_VEC, %rcx
> -#  endif
> -       andl    %ecx, %eax
> -       jz      L(zero_end2)
> -       bsfl    %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end2):
> -       ret
> +       /* Length crosses the page so if rax is zero (no matches)
> +          continue.  */
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_continue)
>
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* if rdx > rcx then any match here must be in [buf:buf + len].
> +        */
> +       tzcnt   %VRAX, %VRAX
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
> +# endif
>         ret
>
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 2
> +L(page_cross_zero):
> +       xorl    %eax, %eax
>         ret
> +
> +       .p2align 4,, 4
> +L(page_cross_ret):
> +       /* Search is entirely contained in page cross case.  */
> +# ifdef USE_AS_WMEMCHR
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_zero)
> +# endif
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(page_cross_zero)
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
> -       /* 7 bytes from next cache line.  */
> +       ret
>  END (MEMCHR)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> index deda1ca395..2073eaa620 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> @@ -1,3 +1,6 @@
> -#define MEMCHR __rawmemchr_evex_rtm
> -#define USE_AS_RAWMEMCHR 1
> -#include "memchr-evex-rtm.S"
> +#define RAWMEMCHR      __rawmemchr_evex_rtm
> +
> +#define USE_IN_RTM     1
> +#define SECTION(p)     p##.evex.rtm
> +
> +#include "rawmemchr-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> index dc1c450699..dad54def2b 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> @@ -1,7 +1,308 @@
> -#ifndef RAWMEMCHR
> -# define RAWMEMCHR     __rawmemchr_evex
> -#endif
> -#define USE_AS_RAWMEMCHR       1
> -#define MEMCHR RAWMEMCHR
> +/* rawmemchr optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef RAWMEMCHR
> +#  define RAWMEMCHR    __rawmemchr_evex
> +# endif
> +
> +
> +# define PC_SHIFT_GPR  rdi
> +# define REG_WIDTH     VEC_SIZE
> +# define VPTESTN       vptestnmb
> +# define VPBROADCAST   vpbroadcastb
> +# define VPMINU        vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ       vpcmpeqb
> +# define CHAR_SIZE     1
> +
> +# include "reg-macros.h"
> +
> +/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
> +   encoding), use VEX encoding in the loop so we
> +   can use vpcmpeqb + vptern which is more efficient than the
> +   EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
> +#  define VZEROUPPER
> +
> +#  define USE_TERN_IN_LOOP     0
> +# else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
> +#  define VZEROUPPER   vzeroupper
> +# endif
> +
> +# define CHAR_PER_VEC  VEC_SIZE
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else /* !(CHAR_PER_VEC == 64) */
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif        /* !(CHAR_PER_VEC == 64) */
> +
> +
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
> +
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (RAWMEMCHR, 6)
> +       VPBROADCAST %esi, %VMATCH
> +       /* Check if we may cross page boundary with one vector load.  */
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       test    %VRAX, %VRAX
> +       jz      L(aligned_more)
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +
> +       .p2align 4,, 4
> +L(first_vec_x4):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax), %rax
> +       ret
>
> -#include "memchr-evex.S"
> +       /* For VEC_SIZE == 32 we can fit this in aligning bytes so might
> +          as well place it more locally.  For VEC_SIZE == 64 we reuse
> +          return code at the end of loop's return.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 4
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* eax has lower page-offset bits of rdi so xor will zero them
> +          out.  */
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       /* Shift out out-of-bounds matches.  */
> +       shrx    %VRDI, %VRAX, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
> +
> +       .p2align 4,, 10
> +L(aligned_more):
> +L(page_cross_continue):
> +       /* Align pointer.  */
> +       andq    $(VEC_SIZE * -1), %rdi
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x4)
> +
> +       subq    $-(VEC_SIZE * 1), %rdi
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# if USE_TERN_IN_LOOP
> +       /* copy VMATCH to low ymm so we can use vpcmpeq which is not
> +          encodable with EVEX registers.  NB: this is VEC_SIZE == 32
> +          only as there is no way to encode vpcmpeq with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
> +# endif
> +
> +       .p2align 4
> +L(loop_4x_vec):
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does
> +          require vzeroupper because it uses ymm0-15.  The reason why
> +          ymm0-15 is used at all is because there is no EVEX encoding
> +          vpcmpeq and with vpcmpeq this loop can be performed more
> +          efficiently.  The non-vzeroupper version is safe for RTM
> +          while the vzeroupper version should be preferred if RTM is
> +          not supported.  Which loop version we use is determined by
> +          USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
> +       /* Since vptern can only take 3x vectors, it is fastest to do 1
> +          vec separately with EVEX vpcmp.  */
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       /* Compare 3x with vpcmpeq and or them all together with vptern.
> +        */
> +
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       subq    $(VEC_SIZE * -4), %rdi
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
> +
> +       /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
> +          VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +
> +       KMOV    %k1, %eax
> +
> +       /* NB:  rax has match from first VEC and rcx has matches from
> +          VEC 2-4.  If rax is non-zero we will return that match.  If
> +          rax is zero adding won't disturb the bits in rcx.  */
> +       add     %rax, %rcx
> +# else
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
> +       subq    $(VEC_SIZE * -4), %rdi
> +       KORTEST %k2, %k3
> +# endif
> +       jz      L(loop_4x_vec)
> +
> +# if USE_TERN_IN_LOOP
> +       test    %VRAX, %VRAX
> +# else
> +       KMOV    %k1, %VRAX
> +       inc     %VRAX
> +# endif
> +       jnz     L(last_vec_x0)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRAX
> +# else
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +# endif
> +       test    %VRAX, %VRAX
> +       jnz     L(last_vec_x1)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRAX
> +# else
> +       KMOV    %k2, %VRAX
> +# endif
> +
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if used VEX encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> +          individually, for VEC_SIZE == 32 we combine them in a single
> +          64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k3, %VRAX
> +L(FALLTHROUGH_RETURN_LBL):
> +# else
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $CHAR_PER_VEC, %rcx
> +       addq    %rcx, %rax
> +# endif
> +       bsf     %rax, %rax
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(TAIL_RETURN_LBL):
> +       bsf     %rax, %rax
> +       leaq    (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +END (RAWMEMCHR)
> +#endif
> --
> 2.34.1
>
Results For: rawmemchr
alignment,char ,length ,__rawmemchr_evex ,__rawmemchr_evex_orig ,new/orig
0        ,0    ,1      ,2.878            ,2.891                 ,0.996 
0        ,0    ,10     ,2.876            ,2.886                 ,0.997 
0        ,0    ,1024   ,22.832           ,23.58                 ,0.968 
0        ,0    ,11     ,2.886            ,2.887                 ,0.999 
0        ,0    ,12     ,2.864            ,2.871                 ,0.998 
0        ,0    ,128    ,5.816            ,6.014                 ,0.967 
0        ,0    ,13     ,2.854            ,2.863                 ,0.997 
0        ,0    ,14     ,2.886            ,2.865                 ,1.007 
0        ,0    ,15     ,2.863            ,2.886                 ,0.992 
0        ,0    ,16     ,2.859            ,2.857                 ,1.001 
0        ,0    ,17     ,2.848            ,2.881                 ,0.988 
0        ,0    ,18     ,2.854            ,2.865                 ,0.996 
0        ,0    ,19     ,2.878            ,2.872                 ,1.002 
0        ,0    ,2      ,2.887            ,2.9                   ,0.995 
0        ,0    ,20     ,2.857            ,2.862                 ,0.998 
0        ,0    ,21     ,2.861            ,2.86                  ,1.0   
0        ,0    ,22     ,2.854            ,2.873                 ,0.993 
0        ,0    ,23     ,2.872            ,2.861                 ,1.004 
0        ,0    ,24     ,2.853            ,2.855                 ,0.999 
0        ,0    ,25     ,2.85             ,2.853                 ,0.999 
0        ,0    ,256    ,10.355           ,10.703                ,0.968 
0        ,0    ,26     ,2.86             ,2.853                 ,1.002 
0        ,0    ,27     ,2.846            ,2.861                 ,0.995 
0        ,0    ,28     ,2.849            ,2.861                 ,0.996 
0        ,0    ,29     ,2.867            ,2.868                 ,1.0   
0        ,0    ,3      ,2.863            ,2.892                 ,0.99  
0        ,0    ,30     ,2.855            ,2.869                 ,0.995 
0        ,0    ,31     ,2.842            ,2.867                 ,0.991 
0        ,0    ,32     ,4.245            ,4.28                  ,0.992 
0        ,0    ,4      ,2.875            ,2.894                 ,0.994 
0        ,0    ,5      ,2.887            ,2.893                 ,0.998 
0        ,0    ,512    ,14.736           ,15.229                ,0.968 
0        ,0    ,6      ,2.876            ,2.868                 ,1.003 
0        ,0    ,64     ,4.957            ,4.968                 ,0.998 
0        ,0    ,7      ,2.893            ,2.88                  ,1.004 
0        ,0    ,8      ,2.856            ,2.867                 ,0.996 
0        ,0    ,9      ,2.872            ,2.885                 ,0.996 
0        ,23   ,1      ,2.826            ,2.859                 ,0.988 
0        ,23   ,10     ,2.861            ,2.876                 ,0.995 
0        ,23   ,1023   ,21.322           ,22.016                ,0.968 
0        ,23   ,1024   ,22.76            ,23.532                ,0.967 
0        ,23   ,11     ,2.872            ,2.875                 ,0.999 
0        ,23   ,12     ,2.872            ,2.881                 ,0.997 
0        ,23   ,127    ,5.293            ,5.38                  ,0.984 
0        ,23   ,1279   ,24.974           ,25.923                ,0.963 
0        ,23   ,128    ,5.904            ,5.683                 ,1.039 
0        ,23   ,1280   ,26.229           ,27.041                ,0.97  
0        ,23   ,13     ,2.878            ,2.87                  ,1.003 
0        ,23   ,14     ,2.843            ,2.87                  ,0.991 
0        ,23   ,15     ,2.864            ,2.873                 ,0.997 
0        ,23   ,1535   ,28.787           ,29.899                ,0.963 
0        ,23   ,1536   ,30.286           ,31.62                 ,0.958 
0        ,23   ,159    ,6.12             ,6.081                 ,1.006 
0        ,23   ,16     ,2.879            ,2.868                 ,1.004 
0        ,23   ,160    ,8.965            ,9.035                 ,0.992 
0        ,23   ,17     ,2.861            ,2.884                 ,0.992 
0        ,23   ,1791   ,32.274           ,33.92                 ,0.951 
0        ,23   ,1792   ,33.701           ,35.386                ,0.952 
0        ,23   ,18     ,2.861            ,2.873                 ,0.996 
0        ,23   ,19     ,2.848            ,2.865                 ,0.994 
0        ,23   ,191    ,8.858            ,9.03                  ,0.981 
0        ,23   ,192    ,9.255            ,9.801                 ,0.944 
0        ,23   ,2      ,2.889            ,2.897                 ,0.997 
0        ,23   ,20     ,2.843            ,2.846                 ,0.999 
0        ,23   ,2047   ,36.33            ,37.384                ,0.972 
0        ,23   ,2048   ,37.147           ,38.863                ,0.956 
0        ,23   ,21     ,2.855            ,2.86                  ,0.998 
0        ,23   ,22     ,2.843            ,2.846                 ,0.999 
0        ,23   ,223    ,8.993            ,9.551                 ,0.942 
0        ,23   ,224    ,9.1              ,9.656                 ,0.942 
0        ,23   ,23     ,2.847            ,2.852                 ,0.998 
0        ,23   ,24     ,2.854            ,2.854                 ,1.0   
0        ,23   ,25     ,2.863            ,2.873                 ,0.996 
0        ,23   ,255    ,9.087            ,9.693                 ,0.938 
0        ,23   ,2559   ,50.009           ,57.564                ,0.869 
0        ,23   ,256    ,10.385           ,10.78                 ,0.963 
0        ,23   ,2560   ,44.992           ,49.487                ,0.909 
0        ,23   ,26     ,2.859            ,2.86                  ,0.999 
0        ,23   ,27     ,2.856            ,2.861                 ,0.998 
0        ,23   ,28     ,2.862            ,2.853                 ,1.003 
0        ,23   ,29     ,2.853            ,2.851                 ,1.001 
0        ,23   ,3      ,2.89             ,2.917                 ,0.991 
0        ,23   ,30     ,2.871            ,2.888                 ,0.994 
0        ,23   ,3071   ,70.078           ,66.366                ,1.056 
0        ,23   ,3072   ,51.136           ,54.752                ,0.934 
0        ,23   ,31     ,2.848            ,2.857                 ,0.997 
0        ,23   ,319    ,10.808           ,11.072                ,0.976 
0        ,23   ,32     ,4.202            ,4.195                 ,1.002 
0        ,23   ,320    ,11.071           ,11.839                ,0.935 
0        ,23   ,3583   ,82.389           ,81.245                ,1.014 
0        ,23   ,3584   ,58.072           ,62.416                ,0.93  
0        ,23   ,383    ,11.152           ,11.866                ,0.94  
0        ,23   ,384    ,12.533           ,12.761                ,0.982 
0        ,23   ,4      ,2.868            ,2.892                 ,0.992 
0        ,23   ,447    ,12.916           ,13.313                ,0.97  
0        ,23   ,448    ,13.303           ,13.954                ,0.953 
0        ,23   ,5      ,2.885            ,2.875                 ,1.004 
0        ,23   ,511    ,13.28            ,13.871                ,0.957 
0        ,23   ,512    ,14.792           ,15.284                ,0.968 
0        ,23   ,6      ,2.857            ,2.87                  ,0.995 
0        ,23   ,63     ,4.277            ,4.283                 ,0.999 
0        ,23   ,639    ,15.31            ,16.14                 ,0.949 
0        ,23   ,64     ,4.961            ,4.961                 ,1.0   
0        ,23   ,640    ,16.757           ,17.581                ,0.953 
0        ,23   ,7      ,2.881            ,2.875                 ,1.002 
0        ,23   ,767    ,17.31            ,18.654                ,0.928 
0        ,23   ,768    ,19.421           ,19.879                ,0.977 
0        ,23   ,8      ,2.871            ,2.878                 ,0.998 
0        ,23   ,895    ,19.345           ,20.32                 ,0.952 
0        ,23   ,896    ,21.683           ,21.331                ,1.017 
0        ,23   ,9      ,2.904            ,2.868                 ,1.013 
0        ,23   ,95     ,4.989            ,4.945                 ,1.009 
0        ,23   ,96     ,5.382            ,5.098                 ,1.056 
1        ,0    ,64     ,4.945            ,4.953                 ,0.998 
1        ,23   ,64     ,4.998            ,4.95                  ,1.01  
2        ,0    ,64     ,4.92             ,4.939                 ,0.996 
2        ,23   ,64     ,4.95             ,4.957                 ,0.999 
3        ,0    ,64     ,4.964            ,4.954                 ,1.002 
3        ,23   ,64     ,4.943            ,4.978                 ,0.993 
4        ,0    ,64     ,4.981            ,4.968                 ,1.003 
4        ,23   ,64     ,4.949            ,4.969                 ,0.996 
5        ,0    ,64     ,4.923            ,4.932                 ,0.998 
5        ,23   ,64     ,4.931            ,4.931                 ,1.0   
6        ,0    ,64     ,4.794            ,4.799                 ,0.999 
6        ,23   ,64     ,4.803            ,4.8                   ,1.001 
Geometric mean (new/old): 0.9859952989629946
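For reference, the per-table geometric mean reported above can be recomputed from the new/old ratio column with a short script along these lines. This is a minimal sketch only, assuming the table has been saved as a plain CSV file with the ratio as the last field; the file name and column layout are illustrative and this is not the benchtest harness itself.

# Minimal sketch: recompute the geometric mean of the new/old ratios
# from a results table exported as plain CSV.  The file name and the
# assumption that the ratio is the last column are illustrative only.
import csv
import math

def geomean_of_ratios(path):
    ratios = []
    with open(path) as f:
        for row in csv.reader(f):
            if not row:
                continue
            try:
                ratios.append(float(row[-1]))
            except ValueError:
                continue  # skip the header and any non-numeric lines
    # Geometric mean = exp(mean(log(r))) over all collected ratios.
    return math.exp(sum(math.log(r) for r in ratios) / len(ratios))

if __name__ == "__main__":
    # e.g. the table above saved as rawmemchr_evex_results.csv
    print(geomean_of_ratios("rawmemchr_evex_results.csv"))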
Results For: memchr
align,invert_pos ,len  ,pos  ,seek_char ,__memchr_evex ,__memchr_evex_orig ,new/old
0    ,0          ,0    ,1    ,0         ,3.473         ,4.166              ,0.834 
0    ,0          ,0    ,1    ,23        ,3.505         ,4.181              ,0.838 
0    ,0          ,1    ,2    ,0         ,3.488         ,3.485              ,1.001 
0    ,0          ,1    ,2    ,23        ,3.472         ,3.469              ,1.001 
0    ,0          ,10   ,11   ,0         ,3.665         ,4.443              ,0.825 
0    ,0          ,10   ,11   ,23        ,3.485         ,3.856              ,0.904 
0    ,0          ,10   ,9    ,0         ,3.646         ,3.872              ,0.942 
0    ,0          ,10   ,9    ,23        ,3.661         ,3.771              ,0.971 
0    ,0          ,1024 ,1024 ,23        ,21.347        ,20.117             ,1.061 
0    ,0          ,1024 ,1056 ,23        ,21.66         ,20.361             ,1.064 
0    ,0          ,1024 ,1088 ,23        ,22.226        ,20.41              ,1.089 
0    ,0          ,1024 ,1120 ,23        ,21.754        ,20.29              ,1.072 
0    ,0          ,1024 ,1152 ,23        ,21.777        ,20.303             ,1.073 
0    ,0          ,1024 ,1184 ,23        ,21.532        ,20.325             ,1.059 
0    ,0          ,1024 ,1216 ,23        ,21.862        ,20.278             ,1.078 
0    ,0          ,1024 ,1248 ,23        ,21.539        ,20.218             ,1.065 
0    ,0          ,1024 ,1280 ,23        ,21.725        ,20.265             ,1.072 
0    ,0          ,1024 ,1312 ,23        ,21.756        ,20.352             ,1.069 
0    ,0          ,1024 ,1344 ,23        ,21.772        ,20.247             ,1.075 
0    ,0          ,1024 ,1376 ,23        ,21.542        ,20.363             ,1.058 
0    ,0          ,1024 ,1408 ,23        ,21.573        ,20.319             ,1.062 
0    ,0          ,1024 ,640  ,23        ,16.42         ,16.53              ,0.993 
0    ,0          ,1024 ,672  ,23        ,16.664        ,16.655             ,1.001 
0    ,0          ,1024 ,704  ,23        ,17.763        ,17.228             ,1.031 
0    ,0          ,1024 ,736  ,23        ,18.094        ,17.306             ,1.046 
0    ,0          ,1024 ,768  ,23        ,18.683        ,18.971             ,0.985 
0    ,0          ,1024 ,800  ,23        ,18.738        ,18.792             ,0.997 
0    ,0          ,1024 ,832  ,23        ,19.831        ,19.277             ,1.029 
0    ,0          ,1024 ,864  ,23        ,19.749        ,19.052             ,1.037 
0    ,0          ,1024 ,896  ,23        ,20.025        ,19.218             ,1.042 
0    ,0          ,1024 ,928  ,23        ,21.18         ,19.66              ,1.077 
0    ,0          ,1024 ,960  ,23        ,20.96         ,21.487             ,0.975 
0    ,0          ,1024 ,992  ,23        ,22.066        ,20.802             ,1.061 
0    ,0          ,1056 ,1024 ,23        ,21.801        ,20.757             ,1.05  
0    ,0          ,1088 ,1024 ,23        ,21.457        ,20.95              ,1.024 
0    ,0          ,11   ,10   ,0         ,3.617         ,3.812              ,0.949 
0    ,0          ,11   ,10   ,23        ,3.701         ,3.848              ,0.962 
0    ,0          ,11   ,12   ,0         ,3.482         ,3.759              ,0.926 
0    ,0          ,11   ,12   ,23        ,3.513         ,3.78               ,0.929 
0    ,0          ,112  ,16   ,23        ,3.56          ,3.807              ,0.935 
0    ,0          ,1120 ,1024 ,23        ,21.753        ,20.777             ,1.047 
0    ,0          ,1152 ,1024 ,23        ,21.724        ,20.948             ,1.037 
0    ,0          ,1184 ,1024 ,23        ,22.588        ,22.291             ,1.013 
0    ,0          ,12   ,11   ,0         ,3.588         ,3.76               ,0.954 
0    ,0          ,12   ,11   ,23        ,3.737         ,3.853              ,0.97  
0    ,0          ,12   ,13   ,0         ,3.504         ,3.843              ,0.912 
0    ,0          ,12   ,13   ,23        ,3.498         ,3.807              ,0.919 
0    ,0          ,1216 ,1024 ,23        ,22.525        ,22.172             ,1.016 
0    ,0          ,1248 ,1024 ,23        ,22.882        ,22.391             ,1.022 
0    ,0          ,128  ,128  ,23        ,5.46          ,6.528              ,0.836 
0    ,0          ,128  ,160  ,23        ,5.622         ,6.848              ,0.821 
0    ,0          ,128  ,192  ,23        ,5.653         ,6.872              ,0.823 
0    ,0          ,128  ,224  ,23        ,6.018         ,7.722              ,0.779 
0    ,0          ,128  ,256  ,23        ,5.693         ,6.915              ,0.823 
0    ,0          ,128  ,288  ,23        ,5.669         ,7.024              ,0.807 
0    ,0          ,128  ,32   ,23        ,4.641         ,5.73               ,0.81  
0    ,0          ,128  ,320  ,23        ,5.588         ,6.872              ,0.813 
0    ,0          ,128  ,352  ,23        ,5.571         ,6.87               ,0.811 
0    ,0          ,128  ,384  ,23        ,5.61          ,6.913              ,0.811 
0    ,0          ,128  ,416  ,23        ,5.545         ,6.835              ,0.811 
0    ,0          ,128  ,448  ,23        ,5.586         ,6.908              ,0.809 
0    ,0          ,128  ,480  ,23        ,5.59          ,6.674              ,0.837 
0    ,0          ,128  ,512  ,23        ,5.58          ,6.76               ,0.825 
0    ,0          ,128  ,64   ,23        ,5.036         ,6.123              ,0.823 
0    ,0          ,128  ,96   ,23        ,6.141         ,6.397              ,0.96  
0    ,0          ,1280 ,1024 ,23        ,22.328        ,22.221             ,1.005 
0    ,0          ,13   ,12   ,0         ,3.551         ,3.81               ,0.932 
0    ,0          ,13   ,12   ,23        ,3.644         ,3.956              ,0.921 
0    ,0          ,13   ,14   ,0         ,3.498         ,3.775              ,0.926 
0    ,0          ,13   ,14   ,23        ,3.489         ,3.785              ,0.922 
0    ,0          ,1312 ,1024 ,23        ,22.724        ,22.229             ,1.022 
0    ,0          ,1344 ,1024 ,23        ,22.405        ,22.205             ,1.009 
0    ,0          ,1376 ,1024 ,23        ,22.286        ,22.346             ,0.997 
0    ,0          ,14   ,13   ,0         ,3.548         ,3.805              ,0.932 
0    ,0          ,14   ,13   ,23        ,3.612         ,3.9                ,0.926 
0    ,0          ,14   ,15   ,0         ,3.491         ,3.771              ,0.926 
0    ,0          ,14   ,15   ,23        ,3.507         ,3.819              ,0.918 
0    ,0          ,1408 ,1024 ,23        ,22.468        ,22.266             ,1.009 
0    ,0          ,144  ,16   ,23        ,3.633         ,3.828              ,0.949 
0    ,0          ,15   ,14   ,0         ,3.642         ,3.863              ,0.943 
0    ,0          ,15   ,14   ,23        ,3.69          ,3.832              ,0.963 
0    ,0          ,15   ,16   ,0         ,3.501         ,3.894              ,0.899 
0    ,0          ,15   ,16   ,23        ,3.611         ,3.839              ,0.941 
0    ,0          ,16   ,112  ,23        ,3.497         ,3.909              ,0.895 
0    ,0          ,16   ,144  ,23        ,3.501         ,3.925              ,0.892 
0    ,0          ,16   ,15   ,0         ,3.658         ,3.857              ,0.948 
0    ,0          ,16   ,15   ,23        ,3.87          ,3.787              ,1.022 
0    ,0          ,16   ,16   ,23        ,3.425         ,3.711              ,0.923 
0    ,0          ,16   ,17   ,0         ,3.5           ,3.848              ,0.909 
0    ,0          ,16   ,17   ,23        ,3.494         ,3.82               ,0.914 
0    ,0          ,16   ,176  ,23        ,3.476         ,3.88               ,0.896 
0    ,0          ,16   ,208  ,23        ,3.464         ,3.799              ,0.912 
0    ,0          ,16   ,240  ,23        ,3.468         ,3.85               ,0.901 
0    ,0          ,16   ,272  ,23        ,3.516         ,3.848              ,0.914 
0    ,0          ,16   ,304  ,23        ,3.497         ,3.869              ,0.904 
0    ,0          ,16   ,336  ,23        ,3.491         ,3.822              ,0.913 
0    ,0          ,16   ,368  ,23        ,3.484         ,3.798              ,0.917 
0    ,0          ,16   ,400  ,23        ,3.493         ,3.877              ,0.901 
0    ,0          ,16   ,48   ,23        ,3.48          ,3.823              ,0.91  
0    ,0          ,16   ,80   ,23        ,3.497         ,3.868              ,0.904 
0    ,0          ,160  ,128  ,23        ,6.651         ,7.158              ,0.929 
0    ,0          ,160  ,256  ,23        ,6.136         ,7.605              ,0.807 
0    ,0          ,160  ,32   ,23        ,4.882         ,5.71               ,0.855 
0    ,0          ,160  ,512  ,23        ,6.102         ,6.676              ,0.914 
0    ,0          ,160  ,64   ,23        ,5.311         ,6.122              ,0.867 
0    ,0          ,1664 ,2048 ,23        ,31.73         ,29.774             ,1.066 
0    ,0          ,1696 ,2048 ,23        ,31.282        ,29.567             ,1.058 
0    ,0          ,17   ,16   ,0         ,3.66          ,3.868              ,0.946 
0    ,0          ,17   ,16   ,23        ,3.803         ,3.855              ,0.986 
0    ,0          ,17   ,18   ,0         ,3.477         ,3.893              ,0.893 
0    ,0          ,17   ,18   ,23        ,3.475         ,3.809              ,0.912 
0    ,0          ,1728 ,2048 ,23        ,32.093        ,30.336             ,1.058 
0    ,0          ,176  ,16   ,23        ,3.665         ,3.884              ,0.944 
0    ,0          ,1760 ,2048 ,23        ,32.968        ,30.894             ,1.067 
0    ,0          ,1792 ,2048 ,23        ,33.445        ,31.817             ,1.051 
0    ,0          ,18   ,17   ,0         ,3.701         ,3.785              ,0.978 
0    ,0          ,18   ,17   ,23        ,3.743         ,3.833              ,0.977 
0    ,0          ,18   ,19   ,0         ,3.478         ,3.837              ,0.907 
0    ,0          ,18   ,19   ,23        ,3.463         ,3.868              ,0.895 
0    ,0          ,1824 ,2048 ,23        ,33.291        ,31.768             ,1.048 
0    ,0          ,1856 ,2048 ,23        ,33.922        ,32.431             ,1.046 
0    ,0          ,1888 ,2048 ,23        ,35.392        ,33.135             ,1.068 
0    ,0          ,19   ,18   ,0         ,3.616         ,3.791              ,0.954 
0    ,0          ,19   ,18   ,23        ,3.813         ,3.807              ,1.002 
0    ,0          ,19   ,20   ,0         ,3.465         ,3.795              ,0.913 
0    ,0          ,19   ,20   ,23        ,3.458         ,3.811              ,0.907 
0    ,0          ,192  ,128  ,23        ,6.158         ,6.144              ,1.002 
0    ,0          ,192  ,256  ,23        ,7.663         ,7.608              ,1.007 
0    ,0          ,192  ,32   ,23        ,4.818         ,5.133              ,0.939 
0    ,0          ,192  ,512  ,23        ,7.465         ,7.249              ,1.03  
0    ,0          ,192  ,64   ,23        ,5.125         ,5.188              ,0.988 
0    ,0          ,1920 ,2048 ,23        ,35.59         ,33.388             ,1.066 
0    ,0          ,1952 ,2048 ,23        ,35.15         ,33.167             ,1.06  
0    ,0          ,1984 ,2048 ,23        ,35.715        ,33.95              ,1.052 
0    ,0          ,2    ,1    ,0         ,3.496         ,3.642              ,0.96  
0    ,0          ,2    ,1    ,23        ,3.466         ,3.444              ,1.007 
0    ,0          ,2    ,3    ,0         ,3.501         ,3.677              ,0.952 
0    ,0          ,2    ,3    ,23        ,3.553         ,3.604              ,0.986 
0    ,0          ,20   ,19   ,0         ,3.573         ,3.804              ,0.939 
0    ,0          ,20   ,19   ,23        ,3.815         ,3.834              ,0.995 
0    ,0          ,20   ,21   ,0         ,3.481         ,3.778              ,0.921 
0    ,0          ,20   ,21   ,23        ,3.481         ,3.833              ,0.908 
0    ,0          ,2016 ,2048 ,23        ,36.429        ,34.281             ,1.063 
0    ,0          ,2048 ,1024 ,0         ,23.047        ,22.507             ,1.024 
0    ,0          ,2048 ,1024 ,23        ,22.719        ,22.414             ,1.014 
0    ,0          ,2048 ,128  ,0         ,6.151         ,6.026              ,1.021 
0    ,0          ,2048 ,128  ,23        ,6.186         ,6.083              ,1.017 
0    ,0          ,2048 ,1664 ,23        ,32.613        ,31.399             ,1.039 
0    ,0          ,2048 ,1696 ,23        ,32.519        ,31.396             ,1.036 
0    ,0          ,2048 ,1728 ,23        ,34.272        ,32.097             ,1.068 
0    ,0          ,2048 ,1760 ,23        ,33.56         ,32.092             ,1.046 
0    ,0          ,2048 ,1792 ,23        ,34.325        ,35.3               ,0.972 
0    ,0          ,2048 ,1824 ,23        ,34.551        ,33.401             ,1.034 
0    ,0          ,2048 ,1856 ,23        ,35.717        ,34.195             ,1.044 
0    ,0          ,2048 ,1888 ,23        ,35.653        ,34.074             ,1.046 
0    ,0          ,2048 ,1920 ,23        ,35.127        ,33.787             ,1.04  
0    ,0          ,2048 ,1952 ,23        ,37.31         ,33.955             ,1.099 
0    ,0          ,2048 ,1984 ,23        ,36.119        ,36.15              ,0.999 
0    ,0          ,2048 ,2016 ,23        ,37.774        ,35.764             ,1.056 
0    ,0          ,2048 ,2048 ,0         ,37.794        ,35.197             ,1.074 
0    ,0          ,2048 ,2048 ,23        ,37.135        ,34.502             ,1.076 
0    ,0          ,2048 ,2080 ,23        ,37.593        ,34.836             ,1.079 
0    ,0          ,2048 ,2112 ,23        ,37.494        ,34.934             ,1.073 
0    ,0          ,2048 ,2144 ,23        ,37.47         ,35.042             ,1.069 
0    ,0          ,2048 ,2176 ,23        ,37.51         ,34.77              ,1.079 
0    ,0          ,2048 ,2208 ,23        ,37.512        ,34.873             ,1.076 
0    ,0          ,2048 ,2240 ,23        ,37.81         ,35.223             ,1.073 
0    ,0          ,2048 ,2272 ,23        ,37.648        ,34.795             ,1.082 
0    ,0          ,2048 ,2304 ,23        ,37.628        ,34.938             ,1.077 
0    ,0          ,2048 ,2336 ,23        ,37.607        ,34.815             ,1.08  
0    ,0          ,2048 ,2368 ,23        ,37.661        ,34.828             ,1.081 
0    ,0          ,2048 ,2400 ,23        ,37.711        ,34.934             ,1.08  
0    ,0          ,2048 ,2432 ,23        ,37.428        ,34.937             ,1.071 
0    ,0          ,2048 ,256  ,0         ,10.418        ,10.646             ,0.979 
0    ,0          ,2048 ,256  ,23        ,10.448        ,10.688             ,0.978 
0    ,0          ,2048 ,32   ,0         ,4.639         ,5.259              ,0.882 
0    ,0          ,2048 ,32   ,23        ,4.822         ,5.232              ,0.922 
0    ,0          ,2048 ,512  ,0         ,14.497        ,14.909             ,0.972 
0    ,0          ,2048 ,512  ,23        ,14.652        ,14.994             ,0.977 
0    ,0          ,2048 ,64   ,0         ,5.159         ,5.176              ,0.997 
0    ,0          ,2048 ,64   ,23        ,5.135         ,5.157              ,0.996 
0    ,0          ,208  ,16   ,23        ,3.6           ,3.935              ,0.915 
0    ,0          ,2080 ,2048 ,23        ,37.366        ,35.59              ,1.05  
0    ,0          ,21   ,20   ,0         ,3.618         ,3.93               ,0.921 
0    ,0          ,21   ,20   ,23        ,3.826         ,3.756              ,1.019 
0    ,0          ,21   ,22   ,0         ,3.456         ,3.754              ,0.92  
0    ,0          ,21   ,22   ,23        ,3.421         ,3.825              ,0.895 
0    ,0          ,2112 ,2048 ,23        ,37.713        ,35.722             ,1.056 
0    ,0          ,2144 ,2048 ,23        ,37.058        ,35.878             ,1.033 
0    ,0          ,2176 ,2048 ,23        ,37.001        ,35.798             ,1.034 
0    ,0          ,22   ,21   ,0         ,3.53          ,3.708              ,0.952 
0    ,0          ,22   ,21   ,23        ,3.705         ,3.821              ,0.97  
0    ,0          ,22   ,23   ,0         ,3.385         ,3.744              ,0.904 
0    ,0          ,22   ,23   ,23        ,3.6           ,4.397              ,0.819 
0    ,0          ,2208 ,2048 ,23        ,37.641        ,37.406             ,1.006 
0    ,0          ,224  ,128  ,23        ,6.174         ,6.209              ,0.994 
0    ,0          ,224  ,256  ,23        ,8.043         ,8.168              ,0.985 
0    ,0          ,224  ,32   ,23        ,5.2           ,5.013              ,1.037 
0    ,0          ,224  ,512  ,23        ,7.923         ,7.845              ,1.01  
0    ,0          ,224  ,64   ,23        ,5.059         ,5.266              ,0.961 
0    ,0          ,2240 ,2048 ,23        ,38.457        ,37.305             ,1.031 
0    ,0          ,2272 ,2048 ,23        ,38.433        ,37.216             ,1.033 
0    ,0          ,23   ,22   ,0         ,3.593         ,3.725              ,0.964 
0    ,0          ,23   ,22   ,23        ,3.689         ,3.827              ,0.964 
0    ,0          ,23   ,24   ,0         ,3.422         ,3.765              ,0.909 
0    ,0          ,23   ,24   ,23        ,3.445         ,3.745              ,0.92  
0    ,0          ,2304 ,2048 ,23        ,37.974        ,37.383             ,1.016 
0    ,0          ,2336 ,2048 ,23        ,38.69         ,37.569             ,1.03  
0    ,0          ,2368 ,2048 ,23        ,38.716        ,37.644             ,1.028 
0    ,0          ,24   ,23   ,0         ,3.549         ,3.806              ,0.932 
0    ,0          ,24   ,23   ,23        ,3.738         ,3.762              ,0.994 
0    ,0          ,24   ,25   ,0         ,3.342         ,3.681              ,0.908 
0    ,0          ,24   ,25   ,23        ,3.341         ,3.823              ,0.874 
0    ,0          ,240  ,16   ,23        ,3.642         ,3.859              ,0.944 
0    ,0          ,2400 ,2048 ,23        ,38.162        ,37.283             ,1.024 
0    ,0          ,2432 ,2048 ,23        ,38.212        ,37.582             ,1.017 
0    ,0          ,25   ,24   ,0         ,3.61          ,3.795              ,0.951 
0    ,0          ,25   ,24   ,23        ,3.695         ,3.769              ,0.98  
0    ,0          ,25   ,26   ,0         ,3.351         ,3.7                ,0.906 
0    ,0          ,25   ,26   ,23        ,3.322         ,3.734              ,0.89  
0    ,0          ,256  ,128  ,23        ,6.204         ,6.079              ,1.02  
0    ,0          ,256  ,160  ,23        ,7.927         ,7.624              ,1.04  
0    ,0          ,256  ,192  ,23        ,7.865         ,7.782              ,1.011 
0    ,0          ,256  ,224  ,23        ,8.83          ,8.766              ,1.007 
0    ,0          ,256  ,256  ,23        ,8.367         ,8.437              ,0.992 
0    ,0          ,256  ,288  ,23        ,8.523         ,8.537              ,0.998 
0    ,0          ,256  ,32   ,23        ,5.07          ,5.007              ,1.013 
0    ,0          ,256  ,320  ,23        ,8.523         ,8.604              ,0.991 
0    ,0          ,256  ,352  ,23        ,8.611         ,8.629              ,0.998 
0    ,0          ,256  ,384  ,23        ,8.541         ,8.495              ,1.005 
0    ,0          ,256  ,416  ,23        ,8.723         ,8.63               ,1.011 
0    ,0          ,256  ,448  ,23        ,8.598         ,8.623              ,0.997 
0    ,0          ,256  ,480  ,23        ,8.498         ,8.622              ,0.986 
0    ,0          ,256  ,512  ,23        ,8.532         ,8.632              ,0.988 
0    ,0          ,256  ,544  ,23        ,9.267         ,8.599              ,1.078 
0    ,0          ,256  ,576  ,23        ,9.163         ,8.699              ,1.053 
0    ,0          ,256  ,608  ,23        ,9.201         ,8.691              ,1.059 
0    ,0          ,256  ,64   ,23        ,5.013         ,5.26               ,0.953 
0    ,0          ,256  ,640  ,23        ,8.489         ,8.643              ,0.982 
0    ,0          ,256  ,96   ,23        ,6.429         ,5.756              ,1.117 
0    ,0          ,26   ,25   ,0         ,3.485         ,3.71               ,0.939 
0    ,0          ,26   ,25   ,23        ,3.535         ,3.742              ,0.945 
0    ,0          ,26   ,27   ,0         ,3.351         ,3.728              ,0.899 
0    ,0          ,26   ,27   ,23        ,3.344         ,3.826              ,0.874 
0    ,0          ,27   ,26   ,0         ,3.462         ,3.683              ,0.94  
0    ,0          ,27   ,26   ,23        ,3.602         ,3.81               ,0.945 
0    ,0          ,27   ,28   ,0         ,3.326         ,3.716              ,0.895 
0    ,0          ,27   ,28   ,23        ,3.313         ,3.698              ,0.896 
0    ,0          ,272  ,16   ,23        ,3.603         ,3.867              ,0.932 
0    ,0          ,28   ,27   ,0         ,3.445         ,3.714              ,0.927 
0    ,0          ,28   ,27   ,23        ,3.553         ,3.789              ,0.938 
0    ,0          ,28   ,29   ,0         ,3.287         ,3.739              ,0.879 
0    ,0          ,28   ,29   ,23        ,3.286         ,3.753              ,0.875 
0    ,0          ,288  ,128  ,23        ,6.189         ,6.001              ,1.031 
0    ,0          ,288  ,256  ,23        ,9.392         ,9.63               ,0.975 
0    ,0          ,288  ,32   ,23        ,5.028         ,5.029              ,1.0   
0    ,0          ,288  ,512  ,23        ,9.082         ,9.382              ,0.968 
0    ,0          ,288  ,64   ,23        ,5.107         ,5.276              ,0.968 
0    ,0          ,29   ,28   ,0         ,3.467         ,3.703              ,0.936 
0    ,0          ,29   ,28   ,23        ,3.643         ,3.785              ,0.962 
0    ,0          ,29   ,30   ,0         ,3.279         ,3.69               ,0.889 
0    ,0          ,29   ,30   ,23        ,3.263         ,3.705              ,0.881 
0    ,0          ,3    ,2    ,0         ,3.483         ,3.75               ,0.929 
0    ,0          ,3    ,2    ,23        ,3.549         ,3.791              ,0.936 
0    ,0          ,3    ,4    ,0         ,3.499         ,3.615              ,0.968 
0    ,0          ,3    ,4    ,23        ,3.492         ,3.616              ,0.966 
0    ,0          ,30   ,29   ,0         ,3.455         ,3.746              ,0.922 
0    ,0          ,30   ,29   ,23        ,3.643         ,3.797              ,0.959 
0    ,0          ,30   ,31   ,0         ,3.309         ,3.704              ,0.893 
0    ,0          ,30   ,31   ,23        ,3.302         ,3.801              ,0.869 
0    ,0          ,304  ,16   ,23        ,3.571         ,3.965              ,0.901 
0    ,0          ,31   ,30   ,0         ,3.428         ,3.748              ,0.915 
0    ,0          ,31   ,30   ,23        ,3.511         ,3.755              ,0.935 
0    ,0          ,32   ,128  ,23        ,3.28          ,3.702              ,0.886 
0    ,0          ,32   ,160  ,23        ,3.308         ,3.702              ,0.894 
0    ,0          ,32   ,192  ,23        ,3.296         ,3.756              ,0.878 
0    ,0          ,32   ,224  ,23        ,3.31          ,3.707              ,0.893 
0    ,0          ,32   ,256  ,23        ,3.314         ,3.715              ,0.892 
0    ,0          ,32   ,288  ,23        ,3.324         ,3.737              ,0.889 
0    ,0          ,32   ,31   ,0         ,3.458         ,3.752              ,0.922 
0    ,0          ,32   ,31   ,23        ,3.456         ,3.7                ,0.934 
0    ,0          ,32   ,32   ,23        ,3.23          ,3.643              ,0.887 
0    ,0          ,32   ,320  ,23        ,3.334         ,3.673              ,0.908 
0    ,0          ,32   ,352  ,23        ,3.324         ,3.728              ,0.892 
0    ,0          ,32   ,384  ,23        ,3.311         ,3.713              ,0.892 
0    ,0          ,32   ,416  ,23        ,3.34          ,3.676              ,0.908 
0    ,0          ,32   ,64   ,23        ,3.285         ,3.673              ,0.895 
0    ,0          ,32   ,96   ,23        ,3.3           ,3.67               ,0.899 
0    ,0          ,320  ,128  ,23        ,6.128         ,5.986              ,1.024 
0    ,0          ,320  ,256  ,23        ,10.255        ,9.859              ,1.04  
0    ,0          ,320  ,32   ,23        ,5.226         ,5.063              ,1.032 
0    ,0          ,320  ,512  ,23        ,10.38         ,10.25              ,1.013 
0    ,0          ,320  ,64   ,23        ,5.062         ,5.193              ,0.975 
0    ,0          ,336  ,16   ,23        ,3.592         ,3.963              ,0.906 
0    ,0          ,352  ,128  ,23        ,6.197         ,6.048              ,1.025 
0    ,0          ,352  ,256  ,23        ,10.583        ,10.571             ,1.001 
0    ,0          ,352  ,32   ,23        ,5.248         ,5.028              ,1.044 
0    ,0          ,352  ,512  ,23        ,10.823        ,10.873             ,0.995 
0    ,0          ,352  ,64   ,23        ,5.071         ,5.202              ,0.975 
0    ,0          ,368  ,16   ,23        ,3.556         ,3.857              ,0.922 
0    ,0          ,3712 ,4096 ,23        ,63.78         ,69.22              ,0.921 
0    ,0          ,3744 ,4096 ,23        ,63.149        ,70.832             ,0.892 
0    ,0          ,3776 ,4096 ,23        ,63.619        ,70.826             ,0.898 
0    ,0          ,3808 ,4096 ,23        ,64.318        ,71.604             ,0.898 
0    ,0          ,384  ,128  ,23        ,6.161         ,6.105              ,1.009 
0    ,0          ,384  ,256  ,23        ,9.792         ,9.752              ,1.004 
0    ,0          ,384  ,32   ,23        ,5.498         ,5.014              ,1.097 
0    ,0          ,384  ,512  ,23        ,11.584        ,11.573             ,1.001 
0    ,0          ,384  ,64   ,23        ,4.951         ,5.261              ,0.941 
0    ,0          ,3840 ,4096 ,23        ,65.775        ,70.85              ,0.928 
0    ,0          ,3872 ,4096 ,23        ,66.258        ,72.207             ,0.918 
0    ,0          ,3904 ,4096 ,23        ,66.891        ,72.083             ,0.928 
0    ,0          ,3936 ,4096 ,23        ,66.326        ,73.547             ,0.902 
0    ,0          ,3968 ,4096 ,23        ,67.857        ,73.444             ,0.924 
0    ,0          ,4    ,3    ,0         ,3.591         ,3.785              ,0.949 
0    ,0          ,4    ,3    ,23        ,3.589         ,3.813              ,0.941 
0    ,0          ,4    ,5    ,0         ,3.486         ,3.514              ,0.992 
0    ,0          ,4    ,5    ,23        ,3.483         ,3.58               ,0.973 
0    ,0          ,400  ,16   ,23        ,3.575         ,3.88               ,0.921 
0    ,0          ,4000 ,4096 ,23        ,67.682        ,74.733             ,0.906 
0    ,0          ,4032 ,4096 ,23        ,67.609        ,76.891             ,0.879 
0    ,0          ,4064 ,4096 ,23        ,68.659        ,76.556             ,0.897 
0    ,0          ,4096 ,3712 ,23        ,64.615        ,88.387             ,0.731 
0    ,0          ,4096 ,3744 ,23        ,64.921        ,87.941             ,0.738 
0    ,0          ,4096 ,3776 ,23        ,65.276        ,87.668             ,0.745 
0    ,0          ,4096 ,3808 ,23        ,66.016        ,88.603             ,0.745 
0    ,0          ,4096 ,3840 ,23        ,70.403        ,91.997             ,0.765 
0    ,0          ,4096 ,3872 ,23        ,67.055        ,87.431             ,0.767 
0    ,0          ,4096 ,3904 ,23        ,68.023        ,89.039             ,0.764 
0    ,0          ,4096 ,3936 ,23        ,67.631        ,89.265             ,0.758 
0    ,0          ,4096 ,3968 ,23        ,68.641        ,74.007             ,0.927 
0    ,0          ,4096 ,4000 ,23        ,72.133        ,78.95              ,0.914 
0    ,0          ,4096 ,4032 ,23        ,69.08         ,77.393             ,0.893 
0    ,0          ,4096 ,4064 ,23        ,70.372        ,77.075             ,0.913 
0    ,0          ,4096 ,4096 ,23        ,69.437        ,75.123             ,0.924 
0    ,0          ,4096 ,4128 ,23        ,70.462        ,75.608             ,0.932 
0    ,0          ,4096 ,4160 ,23        ,69.956        ,75.867             ,0.922 
0    ,0          ,4096 ,4192 ,23        ,69.843        ,75.901             ,0.92  
0    ,0          ,4096 ,4224 ,23        ,70.844        ,76.334             ,0.928 
0    ,0          ,4096 ,4256 ,23        ,69.573        ,75.887             ,0.917 
0    ,0          ,4096 ,4288 ,23        ,70.359        ,76.0               ,0.926 
0    ,0          ,4096 ,4320 ,23        ,71.167        ,75.91              ,0.938 
0    ,0          ,4096 ,4352 ,23        ,69.839        ,75.444             ,0.926 
0    ,0          ,4096 ,4384 ,23        ,69.719        ,75.942             ,0.918 
0    ,0          ,4096 ,4416 ,23        ,69.554        ,75.796             ,0.918 
0    ,0          ,4096 ,4448 ,23        ,69.115        ,75.496             ,0.915 
0    ,0          ,4096 ,4480 ,23        ,70.861        ,75.695             ,0.936 
0    ,0          ,4128 ,4096 ,23        ,69.667        ,77.45              ,0.9   
0    ,0          ,416  ,128  ,23        ,6.163         ,6.065              ,1.016 
0    ,0          ,416  ,256  ,23        ,11.565        ,10.811             ,1.07  
0    ,0          ,416  ,32   ,23        ,5.391         ,5.133              ,1.05  
0    ,0          ,416  ,512  ,23        ,11.685        ,10.918             ,1.07  
0    ,0          ,416  ,64   ,23        ,4.987         ,5.125              ,0.973 
0    ,0          ,4160 ,4096 ,23        ,69.348        ,76.459             ,0.907 
0    ,0          ,4192 ,4096 ,23        ,70.619        ,76.057             ,0.929 
0    ,0          ,4224 ,4096 ,23        ,68.959        ,76.303             ,0.904 
0    ,0          ,4256 ,4096 ,23        ,75.085        ,96.41              ,0.779 
0    ,0          ,4288 ,4096 ,23        ,69.921        ,92.693             ,0.754 
0    ,0          ,4320 ,4096 ,23        ,72.347        ,96.461             ,0.75  
0    ,0          ,4352 ,4096 ,23        ,72.83         ,98.647             ,0.738 
0    ,0          ,4384 ,4096 ,23        ,70.59         ,95.961             ,0.736 
0    ,0          ,4416 ,4096 ,23        ,71.088        ,95.826             ,0.742 
0    ,0          ,4448 ,4096 ,23        ,71.876        ,96.575             ,0.744 
0    ,0          ,448  ,128  ,23        ,6.128         ,6.058              ,1.012 
0    ,0          ,448  ,256  ,23        ,10.492        ,10.524             ,0.997 
0    ,0          ,448  ,512  ,23        ,12.444        ,11.774             ,1.057 
0    ,0          ,448  ,64   ,23        ,4.977         ,5.204              ,0.956 
0    ,0          ,4480 ,4096 ,23        ,70.467        ,95.694             ,0.736 
0    ,0          ,48   ,16   ,23        ,3.472         ,3.889              ,0.893 
0    ,0          ,480  ,128  ,23        ,6.185         ,6.002              ,1.031 
0    ,0          ,480  ,256  ,23        ,10.382        ,10.477             ,0.991 
0    ,0          ,480  ,512  ,23        ,12.402        ,12.486             ,0.993 
0    ,0          ,5    ,4    ,0         ,3.578         ,3.777              ,0.947 
0    ,0          ,5    ,4    ,23        ,3.521         ,3.788              ,0.929 
0    ,0          ,5    ,6    ,0         ,3.489         ,3.712              ,0.94  
0    ,0          ,5    ,6    ,23        ,3.476         ,3.727              ,0.933 
0    ,0          ,512  ,128  ,23        ,6.127         ,6.091              ,1.006 
0    ,0          ,512  ,160  ,23        ,9.055         ,9.019              ,1.004 
0    ,0          ,512  ,192  ,23        ,9.408         ,9.58               ,0.982 
0    ,0          ,512  ,224  ,23        ,9.337         ,9.378              ,0.996 
0    ,0          ,512  ,256  ,23        ,10.419        ,10.511             ,0.991 
0    ,0          ,512  ,288  ,23        ,10.862        ,10.885             ,0.998 
0    ,0          ,512  ,320  ,23        ,11.236        ,11.349             ,0.99  
0    ,0          ,512  ,352  ,23        ,12.097        ,11.381             ,1.063 
0    ,0          ,512  ,384  ,23        ,11.787        ,11.561             ,1.02  
0    ,0          ,512  ,416  ,23        ,12.889        ,12.124             ,1.063 
0    ,0          ,512  ,448  ,23        ,13.497        ,13.479             ,1.001 
0    ,0          ,512  ,480  ,23        ,13.987        ,13.836             ,1.011 
0    ,0          ,512  ,512  ,23        ,13.425        ,13.128             ,1.023 
0    ,0          ,512  ,544  ,23        ,13.628        ,13.322             ,1.023 
0    ,0          ,512  ,576  ,23        ,13.629        ,13.332             ,1.022 
0    ,0          ,512  ,608  ,23        ,13.592        ,13.286             ,1.023 
0    ,0          ,512  ,640  ,23        ,13.504        ,13.303             ,1.015 
0    ,0          ,512  ,672  ,23        ,13.641        ,13.31              ,1.025 
0    ,0          ,512  ,704  ,23        ,13.602        ,14.037             ,0.969 
0    ,0          ,512  ,736  ,23        ,13.599        ,13.259             ,1.026 
0    ,0          ,512  ,768  ,23        ,13.556        ,13.218             ,1.026 
0    ,0          ,512  ,800  ,23        ,13.479        ,13.274             ,1.016 
0    ,0          ,512  ,832  ,23        ,13.588        ,13.265             ,1.024 
0    ,0          ,512  ,864  ,23        ,13.552        ,13.265             ,1.022 
0    ,0          ,512  ,896  ,23        ,13.688        ,13.369             ,1.024 
0    ,0          ,544  ,256  ,23        ,10.269        ,10.421             ,0.985 
0    ,0          ,544  ,512  ,23        ,14.301        ,13.686             ,1.045 
0    ,0          ,576  ,256  ,23        ,10.335        ,10.421             ,0.992 
0    ,0          ,576  ,512  ,23        ,14.129        ,13.776             ,1.026 
0    ,0          ,6    ,5    ,0         ,3.6           ,3.781              ,0.952 
0    ,0          ,6    ,5    ,23        ,3.522         ,3.783              ,0.931 
0    ,0          ,6    ,7    ,0         ,3.506         ,3.787              ,0.926 
0    ,0          ,6    ,7    ,23        ,3.505         ,3.811              ,0.92  
0    ,0          ,608  ,256  ,23        ,10.422        ,10.401             ,1.002 
0    ,0          ,608  ,512  ,23        ,14.041        ,13.801             ,1.017 
0    ,0          ,64   ,128  ,23        ,4.606         ,5.534              ,0.832 
0    ,0          ,64   ,160  ,23        ,4.482         ,5.649              ,0.793 
0    ,0          ,64   ,192  ,23        ,4.629         ,5.528              ,0.837 
0    ,0          ,64   ,224  ,23        ,4.516         ,5.489              ,0.823 
0    ,0          ,64   ,256  ,23        ,4.448         ,5.588              ,0.796 
0    ,0          ,64   ,288  ,23        ,4.581         ,5.517              ,0.83  
0    ,0          ,64   ,32   ,23        ,4.755         ,5.667              ,0.839 
0    ,0          ,64   ,320  ,23        ,4.421         ,5.481              ,0.807 
0    ,0          ,64   ,352  ,23        ,4.562         ,5.522              ,0.826 
0    ,0          ,64   ,384  ,23        ,4.467         ,5.49               ,0.814 
0    ,0          ,64   ,416  ,23        ,4.384         ,5.449              ,0.804 
0    ,0          ,64   ,448  ,23        ,4.492         ,5.542              ,0.811 
0    ,0          ,64   ,64   ,23        ,4.373         ,5.382              ,0.812 
0    ,0          ,64   ,96   ,23        ,4.473         ,5.568              ,0.803 
0    ,0          ,640  ,1024 ,23        ,15.477        ,15.286             ,1.012 
0    ,0          ,640  ,256  ,23        ,10.386        ,10.54              ,0.985 
0    ,0          ,640  ,512  ,23        ,13.804        ,13.711             ,1.007 
0    ,0          ,672  ,1024 ,23        ,15.551        ,15.098             ,1.03  
0    ,0          ,672  ,512  ,23        ,14.409        ,14.727             ,0.978 
0    ,0          ,7    ,6    ,0         ,3.658         ,3.773              ,0.969 
0    ,0          ,7    ,6    ,23        ,3.684         ,3.864              ,0.953 
0    ,0          ,7    ,8    ,0         ,3.506         ,3.831              ,0.915 
0    ,0          ,7    ,8    ,23        ,3.498         ,3.796              ,0.921 
0    ,0          ,704  ,1024 ,23        ,16.131        ,15.806             ,1.021 
0    ,0          ,704  ,512  ,23        ,14.531        ,14.761             ,0.984 
0    ,0          ,736  ,1024 ,23        ,16.909        ,16.371             ,1.033 
0    ,0          ,736  ,512  ,23        ,14.332        ,14.728             ,0.973 
0    ,0          ,768  ,1024 ,23        ,17.52         ,17.314             ,1.012 
0    ,0          ,768  ,512  ,23        ,14.487        ,14.744             ,0.983 
0    ,0          ,7808 ,8192 ,23        ,142.838       ,140.594            ,1.016 
0    ,0          ,7840 ,8192 ,23        ,146.234       ,141.352            ,1.035 
0    ,0          ,7872 ,8192 ,23        ,145.796       ,142.548            ,1.023 
0    ,0          ,7904 ,8192 ,23        ,144.219       ,143.683            ,1.004 
0    ,0          ,7936 ,8192 ,23        ,147.803       ,143.665            ,1.029 
0    ,0          ,7968 ,8192 ,23        ,147.458       ,144.457            ,1.021 
0    ,0          ,8    ,7    ,0         ,3.556         ,3.801              ,0.935 
0    ,0          ,8    ,7    ,23        ,3.613         ,3.782              ,0.955 
0    ,0          ,8    ,9    ,0         ,3.5           ,3.811              ,0.918 
0    ,0          ,8    ,9    ,23        ,3.506         ,3.825              ,0.917 
0    ,0          ,80   ,16   ,23        ,3.541         ,3.965              ,0.893 
0    ,0          ,800  ,1024 ,23        ,17.385        ,17.114             ,1.016 
0    ,0          ,800  ,512  ,23        ,14.447        ,14.829             ,0.974 
0    ,0          ,8000 ,8192 ,23        ,147.199       ,144.857            ,1.016 
0    ,0          ,8032 ,8192 ,23        ,148.789       ,145.683            ,1.021 
0    ,0          ,8064 ,8192 ,23        ,149.846       ,145.922            ,1.027 
0    ,0          ,8096 ,8192 ,23        ,150.151       ,145.632            ,1.031 
0    ,0          ,8128 ,8192 ,23        ,149.362       ,146.551            ,1.019 
0    ,0          ,8160 ,8192 ,23        ,149.914       ,149.245            ,1.004 
0    ,0          ,832  ,1024 ,23        ,17.734        ,17.688             ,1.003 
0    ,0          ,832  ,512  ,23        ,14.485        ,14.736             ,0.983 
0    ,0          ,864  ,1024 ,23        ,18.89         ,17.95              ,1.052 
0    ,0          ,864  ,512  ,23        ,15.036        ,15.126             ,0.994 
0    ,0          ,896  ,1024 ,23        ,19.813        ,18.7               ,1.06  
0    ,0          ,896  ,512  ,23        ,14.523        ,14.808             ,0.981 
0    ,0          ,9    ,10   ,0         ,3.498         ,3.818              ,0.916 
0    ,0          ,9    ,10   ,23        ,3.519         ,3.792              ,0.928 
0    ,0          ,9    ,8    ,0         ,3.637         ,3.787              ,0.96  
0    ,0          ,9    ,8    ,23        ,3.571         ,3.784              ,0.944 
0    ,0          ,928  ,1024 ,23        ,19.587        ,18.73              ,1.046 
0    ,0          ,96   ,128  ,23        ,5.024         ,6.657              ,0.755 
0    ,0          ,96   ,256  ,23        ,5.063         ,6.472              ,0.782 
0    ,0          ,96   ,32   ,23        ,4.998         ,5.735              ,0.871 
0    ,0          ,96   ,64   ,23        ,5.6           ,5.634              ,0.994 
0    ,0          ,960  ,1024 ,23        ,19.758        ,19.474             ,1.015 
0    ,0          ,992  ,1024 ,23        ,21.526        ,19.571             ,1.1   
1    ,0          ,0    ,1    ,0         ,3.321         ,3.989              ,0.832 
1    ,0          ,0    ,1    ,23        ,3.381         ,4.061              ,0.833 
1    ,0          ,192  ,32   ,0         ,4.672         ,5.119              ,0.913 
1    ,0          ,192  ,32   ,23        ,4.516         ,4.979              ,0.907 
1    ,0          ,2    ,1    ,0         ,3.525         ,3.521              ,1.001 
1    ,0          ,2    ,1    ,23        ,3.608         ,3.668              ,0.984 
1    ,0          ,256  ,32   ,0         ,4.58          ,5.029              ,0.911 
1    ,0          ,256  ,32   ,23        ,4.569         ,5.008              ,0.912 
1    ,0          ,256  ,64   ,0         ,5.933         ,5.39               ,1.101 
1    ,0          ,256  ,64   ,23        ,5.057         ,5.365              ,0.943 
1    ,0          ,512  ,32   ,0         ,4.63          ,4.965              ,0.933 
1    ,0          ,512  ,32   ,23        ,4.581         ,5.087              ,0.901 
10   ,0          ,11   ,10   ,0         ,3.57          ,3.81               ,0.937 
10   ,0          ,11   ,10   ,23        ,3.59          ,3.816              ,0.941 
10   ,0          ,9    ,10   ,0         ,3.51          ,3.84               ,0.914 
10   ,0          ,9    ,10   ,23        ,3.506         ,3.818              ,0.918 
11   ,0          ,10   ,11   ,0         ,3.508         ,3.829              ,0.916 
11   ,0          ,10   ,11   ,23        ,3.5           ,3.952              ,0.886 
11   ,0          ,12   ,11   ,0         ,3.62          ,3.813              ,0.949 
11   ,0          ,12   ,11   ,23        ,3.595         ,3.816              ,0.942 
12   ,0          ,11   ,12   ,0         ,3.508         ,3.828              ,0.916 
12   ,0          ,11   ,12   ,23        ,3.509         ,3.823              ,0.918 
12   ,0          ,13   ,12   ,0         ,3.622         ,3.798              ,0.954 
12   ,0          ,13   ,12   ,23        ,3.567         ,3.835              ,0.93  
13   ,0          ,12   ,13   ,0         ,3.51          ,3.797              ,0.924 
13   ,0          ,12   ,13   ,23        ,3.485         ,3.778              ,0.922 
13   ,0          ,14   ,13   ,0         ,3.625         ,3.84               ,0.944 
13   ,0          ,14   ,13   ,23        ,3.594         ,3.842              ,0.935 
14   ,0          ,13   ,14   ,0         ,3.473         ,3.829              ,0.907 
14   ,0          ,13   ,14   ,23        ,3.5           ,3.846              ,0.91  
14   ,0          ,15   ,14   ,0         ,3.691         ,3.795              ,0.973 
14   ,0          ,15   ,14   ,23        ,3.537         ,3.828              ,0.924 
15   ,0          ,14   ,15   ,0         ,3.489         ,3.83               ,0.911 
15   ,0          ,14   ,15   ,23        ,3.495         ,3.793              ,0.921 
15   ,0          ,16   ,15   ,0         ,3.607         ,3.775              ,0.956 
15   ,0          ,16   ,15   ,23        ,3.619         ,3.883              ,0.932 
16   ,0          ,15   ,16   ,0         ,3.518         ,3.852              ,0.913 
16   ,0          ,15   ,16   ,23        ,3.492         ,3.772              ,0.926 
16   ,0          ,17   ,16   ,0         ,3.624         ,3.859              ,0.939 
16   ,0          ,17   ,16   ,23        ,3.634         ,3.817              ,0.952 
17   ,0          ,16   ,17   ,0         ,3.485         ,3.89               ,0.896 
17   ,0          ,16   ,17   ,23        ,3.498         ,3.836              ,0.912 
17   ,0          ,18   ,17   ,0         ,3.583         ,3.816              ,0.939 
17   ,0          ,18   ,17   ,23        ,3.595         ,3.818              ,0.942 
18   ,0          ,17   ,18   ,0         ,3.468         ,3.839              ,0.903 
18   ,0          ,17   ,18   ,23        ,3.493         ,3.805              ,0.918 
18   ,0          ,19   ,18   ,0         ,3.593         ,3.805              ,0.944 
18   ,0          ,19   ,18   ,23        ,3.585         ,3.776              ,0.949 
19   ,0          ,18   ,19   ,0         ,3.474         ,3.818              ,0.91  
19   ,0          ,18   ,19   ,23        ,3.474         ,3.832              ,0.907 
19   ,0          ,20   ,19   ,0         ,3.576         ,3.849              ,0.929 
19   ,0          ,20   ,19   ,23        ,3.502         ,3.873              ,0.904 
2    ,0          ,1    ,2    ,0         ,3.515         ,3.515              ,1.0   
2    ,0          ,1    ,2    ,23        ,3.506         ,3.504              ,1.0   
2    ,0          ,192  ,64   ,0         ,5.019         ,5.348              ,0.938 
2    ,0          ,192  ,64   ,23        ,5.265         ,5.433              ,0.969 
2    ,0          ,256  ,64   ,0         ,5.028         ,5.155              ,0.975 
2    ,0          ,256  ,64   ,23        ,4.967         ,5.161              ,0.962 
2    ,0          ,3    ,2    ,0         ,3.603         ,3.78               ,0.953 
2    ,0          ,3    ,2    ,23        ,3.568         ,3.829              ,0.932 
2    ,0          ,512  ,64   ,0         ,4.982         ,5.124              ,0.972 
2    ,0          ,512  ,64   ,23        ,4.963         ,5.239              ,0.947 
20   ,0          ,19   ,20   ,0         ,3.446         ,3.791              ,0.909 
20   ,0          ,19   ,20   ,23        ,3.475         ,3.819              ,0.91  
20   ,0          ,21   ,20   ,0         ,3.601         ,3.776              ,0.954 
20   ,0          ,21   ,20   ,23        ,3.599         ,3.798              ,0.948 
2048 ,0          ,0    ,1    ,0         ,3.429         ,4.112              ,0.834 
2048 ,0          ,0    ,1    ,23        ,3.455         ,4.144              ,0.834 
2048 ,0          ,1    ,2    ,0         ,3.525         ,3.505              ,1.006 
2048 ,0          ,1    ,2    ,23        ,3.498         ,3.496              ,1.001 
2048 ,0          ,10   ,11   ,0         ,3.5           ,3.931              ,0.89  
2048 ,0          ,10   ,11   ,23        ,3.542         ,3.848              ,0.92  
2048 ,0          ,10   ,9    ,0         ,3.588         ,3.819              ,0.94  
2048 ,0          ,10   ,9    ,23        ,3.595         ,3.836              ,0.937 
2048 ,0          ,11   ,10   ,0         ,3.626         ,3.785              ,0.958 
2048 ,0          ,11   ,10   ,23        ,3.622         ,3.816              ,0.949 
2048 ,0          ,11   ,12   ,0         ,3.491         ,3.826              ,0.912 
2048 ,0          ,11   ,12   ,23        ,3.49          ,3.804              ,0.917 
2048 ,0          ,12   ,11   ,0         ,3.556         ,3.774              ,0.942 
2048 ,0          ,12   ,11   ,23        ,3.678         ,3.986              ,0.923 
2048 ,0          ,12   ,13   ,0         ,3.494         ,3.835              ,0.911 
2048 ,0          ,12   ,13   ,23        ,3.481         ,3.829              ,0.909 
2048 ,0          ,13   ,12   ,0         ,3.632         ,3.888              ,0.934 
2048 ,0          ,13   ,12   ,23        ,3.614         ,3.824              ,0.945 
2048 ,0          ,13   ,14   ,0         ,3.497         ,3.888              ,0.9   
2048 ,0          ,13   ,14   ,23        ,3.506         ,3.833              ,0.915 
2048 ,0          ,14   ,13   ,0         ,3.568         ,3.792              ,0.941 
2048 ,0          ,14   ,13   ,23        ,3.563         ,3.829              ,0.931 
2048 ,0          ,14   ,15   ,0         ,3.482         ,3.809              ,0.914 
2048 ,0          ,14   ,15   ,23        ,3.471         ,3.792              ,0.915 
2048 ,0          ,15   ,14   ,0         ,3.598         ,3.813              ,0.944 
2048 ,0          ,15   ,14   ,23        ,3.576         ,3.868              ,0.925 
2048 ,0          ,15   ,16   ,0         ,3.506         ,3.915              ,0.896 
2048 ,0          ,15   ,16   ,23        ,3.494         ,3.827              ,0.913 
2048 ,0          ,16   ,15   ,0         ,3.564         ,3.857              ,0.924 
2048 ,0          ,16   ,15   ,23        ,3.578         ,3.789              ,0.944 
2048 ,0          ,16   ,17   ,0         ,3.487         ,3.826              ,0.911 
2048 ,0          ,16   ,17   ,23        ,3.472         ,3.789              ,0.916 
2048 ,0          ,17   ,16   ,0         ,3.572         ,3.859              ,0.925 
2048 ,0          ,17   ,16   ,23        ,3.64          ,3.797              ,0.959 
2048 ,0          ,17   ,18   ,0         ,3.485         ,3.808              ,0.915 
2048 ,0          ,17   ,18   ,23        ,3.471         ,3.896              ,0.891 
2048 ,0          ,18   ,17   ,0         ,3.585         ,3.802              ,0.943 
2048 ,0          ,18   ,17   ,23        ,3.578         ,3.834              ,0.933 
2048 ,0          ,18   ,19   ,0         ,3.5           ,3.797              ,0.922 
2048 ,0          ,18   ,19   ,23        ,3.468         ,3.798              ,0.913 
2048 ,0          ,19   ,18   ,0         ,3.595         ,3.893              ,0.923 
2048 ,0          ,19   ,18   ,23        ,3.588         ,3.862              ,0.929 
2048 ,0          ,19   ,20   ,0         ,3.455         ,3.908              ,0.884 
2048 ,0          ,19   ,20   ,23        ,3.465         ,3.801              ,0.911 
2048 ,0          ,2    ,1    ,0         ,3.461         ,3.542              ,0.977 
2048 ,0          ,2    ,1    ,23        ,3.27          ,3.298              ,0.992 
2048 ,0          ,2    ,3    ,0         ,3.686         ,3.71               ,0.994 
2048 ,0          ,2    ,3    ,23        ,3.681         ,3.836              ,0.959 
2048 ,0          ,20   ,19   ,0         ,3.601         ,3.756              ,0.959 
2048 ,0          ,20   ,19   ,23        ,3.586         ,3.85               ,0.932 
2048 ,0          ,20   ,21   ,0         ,3.448         ,3.753              ,0.919 
2048 ,0          ,20   ,21   ,23        ,3.496         ,3.85               ,0.908 
2048 ,0          ,21   ,20   ,0         ,3.632         ,3.848              ,0.944 
2048 ,0          ,21   ,20   ,23        ,3.599         ,3.813              ,0.944 
2048 ,0          ,21   ,22   ,0         ,3.45          ,3.763              ,0.917 
2048 ,0          ,21   ,22   ,23        ,3.436         ,3.82               ,0.899 
2048 ,0          ,22   ,21   ,0         ,3.575         ,3.914              ,0.914 
2048 ,0          ,22   ,21   ,23        ,3.574         ,3.793              ,0.942 
2048 ,0          ,22   ,23   ,0         ,3.442         ,3.759              ,0.916 
2048 ,0          ,22   ,23   ,23        ,3.437         ,3.802              ,0.904 
2048 ,0          ,23   ,22   ,0         ,3.553         ,3.789              ,0.938 
2048 ,0          ,23   ,22   ,23        ,3.571         ,3.739              ,0.955 
2048 ,0          ,23   ,24   ,0         ,3.429         ,3.78               ,0.907 
2048 ,0          ,23   ,24   ,23        ,3.467         ,3.739              ,0.927 
2048 ,0          ,24   ,23   ,0         ,3.566         ,3.821              ,0.933 
2048 ,0          ,24   ,23   ,23        ,3.536         ,3.759              ,0.941 
2048 ,0          ,24   ,25   ,0         ,3.429         ,3.718              ,0.922 
2048 ,0          ,24   ,25   ,23        ,3.431         ,3.794              ,0.904 
2048 ,0          ,25   ,24   ,0         ,3.521         ,3.735              ,0.943 
2048 ,0          ,25   ,24   ,23        ,3.557         ,3.713              ,0.958 
2048 ,0          ,25   ,26   ,0         ,3.389         ,3.764              ,0.901 
2048 ,0          ,25   ,26   ,23        ,3.369         ,3.712              ,0.908 
2048 ,0          ,26   ,25   ,0         ,3.511         ,3.82               ,0.919 
2048 ,0          ,26   ,25   ,23        ,3.524         ,3.81               ,0.925 
2048 ,0          ,26   ,27   ,0         ,3.399         ,3.767              ,0.902 
2048 ,0          ,26   ,27   ,23        ,3.411         ,3.733              ,0.914 
2048 ,0          ,27   ,26   ,0         ,3.511         ,3.742              ,0.938 
2048 ,0          ,27   ,26   ,23        ,3.526         ,3.733              ,0.945 
2048 ,0          ,27   ,28   ,0         ,3.358         ,3.709              ,0.905 
2048 ,0          ,27   ,28   ,23        ,3.408         ,3.735              ,0.912 
2048 ,0          ,28   ,27   ,0         ,3.508         ,3.733              ,0.94  
2048 ,0          ,28   ,27   ,23        ,3.467         ,3.686              ,0.941 
2048 ,0          ,28   ,29   ,0         ,3.335         ,3.699              ,0.902 
2048 ,0          ,28   ,29   ,23        ,3.363         ,3.675              ,0.915 
2048 ,0          ,29   ,28   ,0         ,3.561         ,3.72               ,0.957 
2048 ,0          ,29   ,28   ,23        ,3.501         ,3.707              ,0.944 
2048 ,0          ,29   ,30   ,0         ,3.348         ,3.734              ,0.897 
2048 ,0          ,29   ,30   ,23        ,3.336         ,3.767              ,0.886 
2048 ,0          ,3    ,2    ,0         ,3.627         ,3.8                ,0.954 
2048 ,0          ,3    ,2    ,23        ,3.632         ,3.831              ,0.948 
2048 ,0          ,3    ,4    ,0         ,3.501         ,3.491              ,1.003 
2048 ,0          ,3    ,4    ,23        ,3.498         ,3.652              ,0.958 
2048 ,0          ,30   ,29   ,0         ,3.528         ,3.794              ,0.93  
2048 ,0          ,30   ,29   ,23        ,3.47          ,3.666              ,0.947 
2048 ,0          ,30   ,31   ,0         ,3.355         ,3.752              ,0.894 
2048 ,0          ,30   ,31   ,23        ,3.316         ,3.671              ,0.903 
2048 ,0          ,31   ,30   ,0         ,3.429         ,3.679              ,0.932 
2048 ,0          ,31   ,30   ,23        ,3.441         ,3.724              ,0.924 
2048 ,0          ,32   ,31   ,0         ,3.367         ,3.671              ,0.917 
2048 ,0          ,32   ,31   ,23        ,3.416         ,3.708              ,0.921 
2048 ,0          ,4    ,3    ,0         ,3.699         ,3.977              ,0.93  
2048 ,0          ,4    ,3    ,23        ,3.832         ,3.977              ,0.964 
2048 ,0          ,4    ,5    ,0         ,3.527         ,3.549              ,0.994 
2048 ,0          ,4    ,5    ,23        ,3.489         ,3.567              ,0.978 
2048 ,0          ,5    ,4    ,0         ,3.657         ,3.842              ,0.952 
2048 ,0          ,5    ,4    ,23        ,3.655         ,3.789              ,0.965 
2048 ,0          ,5    ,6    ,0         ,3.51          ,3.778              ,0.929 
2048 ,0          ,5    ,6    ,23        ,3.498         ,3.794              ,0.922 
2048 ,0          ,6    ,5    ,0         ,3.601         ,3.798              ,0.948 
2048 ,0          ,6    ,5    ,23        ,3.637         ,3.846              ,0.946 
2048 ,0          ,6    ,7    ,0         ,3.48          ,3.741              ,0.93  
2048 ,0          ,6    ,7    ,23        ,3.489         ,3.804              ,0.917 
2048 ,0          ,7    ,6    ,0         ,3.613         ,3.817              ,0.947 
2048 ,0          ,7    ,6    ,23        ,3.6           ,3.783              ,0.952 
2048 ,0          ,7    ,8    ,0         ,3.48          ,3.816              ,0.912 
2048 ,0          ,7    ,8    ,23        ,3.498         ,3.743              ,0.934 
2048 ,0          ,8    ,7    ,0         ,3.599         ,3.791              ,0.95  
2048 ,0          ,8    ,7    ,23        ,3.616         ,3.859              ,0.937 
2048 ,0          ,8    ,9    ,0         ,3.509         ,3.791              ,0.925 
2048 ,0          ,8    ,9    ,23        ,3.501         ,3.801              ,0.921 
2048 ,0          ,9    ,10   ,0         ,3.509         ,3.841              ,0.913 
2048 ,0          ,9    ,10   ,23        ,3.507         ,3.804              ,0.922 
2048 ,0          ,9    ,8    ,0         ,3.583         ,3.771              ,0.95  
2048 ,0          ,9    ,8    ,23        ,3.551         ,3.844              ,0.924 
2049 ,0          ,0    ,1    ,0         ,3.316         ,3.994              ,0.83  
2049 ,0          ,0    ,1    ,23        ,3.378         ,4.055              ,0.833 
2049 ,0          ,2    ,1    ,0         ,3.498         ,3.602              ,0.971 
2049 ,0          ,2    ,1    ,23        ,3.502         ,3.565              ,0.982 
2050 ,0          ,1    ,2    ,0         ,3.533         ,3.531              ,1.001 
2050 ,0          ,1    ,2    ,23        ,3.513         ,3.504              ,1.002 
2050 ,0          ,3    ,2    ,0         ,3.628         ,3.894              ,0.932 
2050 ,0          ,3    ,2    ,23        ,3.579         ,3.836              ,0.933 
2051 ,0          ,2    ,3    ,0         ,3.697         ,3.771              ,0.98  
2051 ,0          ,2    ,3    ,23        ,3.696         ,3.738              ,0.989 
2051 ,0          ,4    ,3    ,0         ,3.751         ,3.969              ,0.945 
2051 ,0          ,4    ,3    ,23        ,3.713         ,3.979              ,0.933 
2052 ,0          ,3    ,4    ,0         ,3.498         ,3.544              ,0.987 
2052 ,0          ,3    ,4    ,23        ,3.521         ,3.513              ,1.002 
2052 ,0          ,5    ,4    ,0         ,3.575         ,3.824              ,0.935 
2052 ,0          ,5    ,4    ,23        ,3.598         ,3.877              ,0.928 
2053 ,0          ,4    ,5    ,0         ,3.506         ,3.592              ,0.976 
2053 ,0          ,4    ,5    ,23        ,3.509         ,3.525              ,0.996 
2053 ,0          ,6    ,5    ,0         ,3.558         ,3.881              ,0.917 
2053 ,0          ,6    ,5    ,23        ,3.597         ,3.853              ,0.933 
2054 ,0          ,5    ,6    ,0         ,3.503         ,3.807              ,0.92  
2054 ,0          ,5    ,6    ,23        ,3.515         ,3.827              ,0.919 
2054 ,0          ,7    ,6    ,0         ,3.535         ,3.793              ,0.932 
2054 ,0          ,7    ,6    ,23        ,3.572         ,3.796              ,0.941 
2055 ,0          ,6    ,7    ,0         ,3.492         ,3.691              ,0.946 
2055 ,0          ,6    ,7    ,23        ,3.489         ,3.717              ,0.939 
2055 ,0          ,8    ,7    ,0         ,3.604         ,3.792              ,0.95  
2055 ,0          ,8    ,7    ,23        ,3.542         ,3.784              ,0.936 
2056 ,0          ,7    ,8    ,0         ,3.507         ,3.861              ,0.908 
2056 ,0          ,7    ,8    ,23        ,3.501         ,3.825              ,0.915 
2056 ,0          ,9    ,8    ,0         ,3.599         ,3.792              ,0.949 
2056 ,0          ,9    ,8    ,23        ,3.585         ,3.818              ,0.939 
2057 ,0          ,10   ,9    ,0         ,3.607         ,3.816              ,0.945 
2057 ,0          ,10   ,9    ,23        ,3.652         ,3.814              ,0.958 
2057 ,0          ,8    ,9    ,0         ,3.515         ,3.827              ,0.918 
2057 ,0          ,8    ,9    ,23        ,3.506         ,3.808              ,0.921 
2058 ,0          ,11   ,10   ,0         ,3.593         ,3.806              ,0.944 
2058 ,0          ,11   ,10   ,23        ,3.623         ,3.845              ,0.942 
2058 ,0          ,9    ,10   ,0         ,3.506         ,3.844              ,0.912 
2058 ,0          ,9    ,10   ,23        ,3.498         ,3.819              ,0.916 
2059 ,0          ,10   ,11   ,0         ,3.506         ,3.862              ,0.908 
2059 ,0          ,10   ,11   ,23        ,3.509         ,3.794              ,0.925 
2059 ,0          ,12   ,11   ,0         ,3.567         ,3.855              ,0.925 
2059 ,0          ,12   ,11   ,23        ,3.595         ,3.8                ,0.946 
2060 ,0          ,11   ,12   ,0         ,3.509         ,3.87               ,0.907 
2060 ,0          ,11   ,12   ,23        ,3.494         ,3.773              ,0.926 
2060 ,0          ,13   ,12   ,0         ,3.537         ,3.78               ,0.936 
2060 ,0          ,13   ,12   ,23        ,3.631         ,3.839              ,0.946 
2061 ,0          ,12   ,13   ,0         ,3.509         ,3.854              ,0.91  
2061 ,0          ,12   ,13   ,23        ,3.491         ,3.815              ,0.915 
2061 ,0          ,14   ,13   ,0         ,3.572         ,3.838              ,0.931 
2061 ,0          ,14   ,13   ,23        ,3.588         ,3.796              ,0.945 
2062 ,0          ,13   ,14   ,0         ,3.497         ,3.839              ,0.911 
2062 ,0          ,13   ,14   ,23        ,3.481         ,3.809              ,0.914 
2062 ,0          ,15   ,14   ,0         ,3.621         ,3.802              ,0.952 
2062 ,0          ,15   ,14   ,23        ,3.549         ,3.869              ,0.917 
2063 ,0          ,14   ,15   ,0         ,3.489         ,3.825              ,0.912 
2063 ,0          ,14   ,15   ,23        ,3.478         ,3.78               ,0.92  
2063 ,0          ,16   ,15   ,0         ,3.571         ,3.823              ,0.934 
2063 ,0          ,16   ,15   ,23        ,3.58          ,3.827              ,0.935 
2064 ,0          ,15   ,16   ,0         ,3.489         ,3.846              ,0.907 
2064 ,0          ,15   ,16   ,23        ,3.486         ,3.827              ,0.911 
2064 ,0          ,17   ,16   ,0         ,3.567         ,3.811              ,0.936 
2064 ,0          ,17   ,16   ,23        ,3.638         ,3.83               ,0.95  
2065 ,0          ,16   ,17   ,0         ,3.482         ,3.772              ,0.923 
2065 ,0          ,16   ,17   ,23        ,3.498         ,3.841              ,0.911 
2065 ,0          ,18   ,17   ,0         ,3.559         ,3.807              ,0.935 
2065 ,0          ,18   ,17   ,23        ,3.62          ,3.731              ,0.97  
2066 ,0          ,17   ,18   ,0         ,3.476         ,3.809              ,0.913 
2066 ,0          ,17   ,18   ,23        ,3.467         ,3.843              ,0.902 
2066 ,0          ,19   ,18   ,0         ,3.58          ,3.806              ,0.941 
2066 ,0          ,19   ,18   ,23        ,3.577         ,3.915              ,0.914 
2067 ,0          ,18   ,19   ,0         ,3.485         ,3.828              ,0.91  
2067 ,0          ,18   ,19   ,23        ,3.471         ,3.831              ,0.906 
2067 ,0          ,20   ,19   ,0         ,3.611         ,3.848              ,0.938 
2067 ,0          ,20   ,19   ,23        ,3.582         ,3.855              ,0.929 
2068 ,0          ,19   ,20   ,0         ,3.449         ,3.739              ,0.922 
2068 ,0          ,19   ,20   ,23        ,3.463         ,3.827              ,0.905 
2068 ,0          ,21   ,20   ,0         ,3.669         ,3.824              ,0.959 
2068 ,0          ,21   ,20   ,23        ,3.6           ,3.845              ,0.936 
2069 ,0          ,20   ,21   ,0         ,3.441         ,3.802              ,0.905 
2069 ,0          ,20   ,21   ,23        ,3.463         ,3.735              ,0.927 
2069 ,0          ,22   ,21   ,0         ,3.609         ,3.768              ,0.958 
2069 ,0          ,22   ,21   ,23        ,3.605         ,3.769              ,0.956 
2070 ,0          ,21   ,22   ,0         ,3.431         ,3.815              ,0.899 
2070 ,0          ,21   ,22   ,23        ,3.452         ,3.81               ,0.906 
2070 ,0          ,23   ,22   ,0         ,3.563         ,3.811              ,0.935 
2070 ,0          ,23   ,22   ,23        ,3.53          ,3.85               ,0.917 
2071 ,0          ,22   ,23   ,0         ,3.439         ,3.837              ,0.896 
2071 ,0          ,22   ,23   ,23        ,3.421         ,3.778              ,0.905 
2071 ,0          ,24   ,23   ,0         ,3.552         ,3.746              ,0.948 
2071 ,0          ,24   ,23   ,23        ,3.545         ,3.805              ,0.932 
2072 ,0          ,23   ,24   ,0         ,3.431         ,3.788              ,0.906 
2072 ,0          ,23   ,24   ,23        ,3.444         ,3.789              ,0.909 
2072 ,0          ,25   ,24   ,0         ,3.553         ,3.781              ,0.94  
2072 ,0          ,25   ,24   ,23        ,3.563         ,3.74               ,0.953 
2073 ,0          ,24   ,25   ,0         ,3.421         ,3.688              ,0.928 
2073 ,0          ,24   ,25   ,23        ,3.425         ,3.833              ,0.893 
2073 ,0          ,26   ,25   ,0         ,3.56          ,3.765              ,0.945 
2073 ,0          ,26   ,25   ,23        ,3.549         ,3.758              ,0.945 
2074 ,0          ,25   ,26   ,0         ,3.4           ,3.743              ,0.908 
2074 ,0          ,25   ,26   ,23        ,3.39          ,3.725              ,0.91  
2074 ,0          ,27   ,26   ,0         ,3.509         ,3.807              ,0.922 
2074 ,0          ,27   ,26   ,23        ,3.514         ,3.791              ,0.927 
2075 ,0          ,26   ,27   ,0         ,3.395         ,3.765              ,0.902 
2075 ,0          ,26   ,27   ,23        ,3.391         ,3.75               ,0.904 
2075 ,0          ,28   ,27   ,0         ,3.538         ,3.772              ,0.938 
2075 ,0          ,28   ,27   ,23        ,3.504         ,3.705              ,0.946 
2076 ,0          ,27   ,28   ,0         ,3.368         ,3.689              ,0.913 
2076 ,0          ,27   ,28   ,23        ,3.358         ,3.732              ,0.9   
2076 ,0          ,29   ,28   ,0         ,3.523         ,3.723              ,0.946 
2076 ,0          ,29   ,28   ,23        ,3.443         ,3.752              ,0.917 
2077 ,0          ,28   ,29   ,0         ,3.356         ,3.711              ,0.904 
2077 ,0          ,28   ,29   ,23        ,3.348         ,3.684              ,0.909 
2077 ,0          ,30   ,29   ,0         ,3.5           ,3.68               ,0.951 
2077 ,0          ,30   ,29   ,23        ,3.4           ,3.711              ,0.916 
2078 ,0          ,29   ,30   ,0         ,3.368         ,3.697              ,0.911 
2078 ,0          ,29   ,30   ,23        ,3.348         ,3.652              ,0.917 
2078 ,0          ,31   ,30   ,0         ,3.455         ,3.781              ,0.914 
2078 ,0          ,31   ,30   ,23        ,3.461         ,3.735              ,0.927 
2079 ,0          ,30   ,31   ,0         ,3.372         ,3.816              ,0.884 
2079 ,0          ,30   ,31   ,23        ,3.357         ,3.692              ,0.909 
2079 ,0          ,32   ,31   ,0         ,3.358         ,3.741              ,0.898 
2079 ,0          ,32   ,31   ,23        ,3.386         ,3.702              ,0.915 
21   ,0          ,20   ,21   ,0         ,3.485         ,3.842              ,0.907 
21   ,0          ,20   ,21   ,23        ,3.469         ,3.829              ,0.906 
21   ,0          ,22   ,21   ,0         ,3.541         ,3.756              ,0.943 
21   ,0          ,22   ,21   ,23        ,3.586         ,3.787              ,0.947 
22   ,0          ,21   ,22   ,0         ,3.438         ,3.813              ,0.902 
22   ,0          ,21   ,22   ,23        ,3.44          ,3.788              ,0.908 
22   ,0          ,23   ,22   ,0         ,3.602         ,3.905              ,0.922 
22   ,0          ,23   ,22   ,23        ,3.604         ,3.83               ,0.941 
23   ,0          ,22   ,23   ,0         ,3.396         ,3.736              ,0.909 
23   ,0          ,22   ,23   ,23        ,3.386         ,3.856              ,0.878 
23   ,0          ,24   ,23   ,0         ,3.589         ,3.853              ,0.932 
23   ,0          ,24   ,23   ,23        ,3.528         ,3.816              ,0.925 
24   ,0          ,23   ,24   ,0         ,3.414         ,3.688              ,0.926 
24   ,0          ,23   ,24   ,23        ,3.402         ,3.768              ,0.903 
24   ,0          ,25   ,24   ,0         ,3.524         ,3.701              ,0.952 
24   ,0          ,25   ,24   ,23        ,3.486         ,3.738              ,0.933 
25   ,0          ,24   ,25   ,0         ,3.383         ,3.755              ,0.901 
25   ,0          ,24   ,25   ,23        ,3.382         ,3.766              ,0.898 
25   ,0          ,26   ,25   ,0         ,3.51          ,3.789              ,0.926 
25   ,0          ,26   ,25   ,23        ,3.475         ,3.735              ,0.93  
26   ,0          ,25   ,26   ,0         ,3.367         ,3.8                ,0.886 
26   ,0          ,25   ,26   ,23        ,3.364         ,3.732              ,0.901 
26   ,0          ,27   ,26   ,0         ,3.544         ,3.664              ,0.967 
26   ,0          ,27   ,26   ,23        ,3.487         ,3.706              ,0.941 
27   ,0          ,26   ,27   ,0         ,3.358         ,3.683              ,0.912 
27   ,0          ,26   ,27   ,23        ,3.33          ,3.736              ,0.891 
27   ,0          ,28   ,27   ,0         ,3.488         ,3.666              ,0.951 
27   ,0          ,28   ,27   ,23        ,3.479         ,3.707              ,0.938 
28   ,0          ,27   ,28   ,0         ,3.367         ,3.826              ,0.88  
28   ,0          ,27   ,28   ,23        ,3.323         ,3.709              ,0.896 
28   ,0          ,29   ,28   ,0         ,3.468         ,3.704              ,0.936 
28   ,0          ,29   ,28   ,23        ,3.537         ,3.804              ,0.93  
29   ,0          ,28   ,29   ,0         ,3.322         ,3.699              ,0.898 
29   ,0          ,28   ,29   ,23        ,3.291         ,3.701              ,0.889 
29   ,0          ,30   ,29   ,0         ,3.451         ,3.715              ,0.929 
29   ,0          ,30   ,29   ,23        ,3.412         ,3.674              ,0.929 
3    ,0          ,192  ,96   ,0         ,5.844         ,5.713              ,1.023 
3    ,0          ,192  ,96   ,23        ,5.792         ,5.688              ,1.018 
3    ,0          ,2    ,3    ,0         ,3.699         ,3.756              ,0.985 
3    ,0          ,2    ,3    ,23        ,3.686         ,3.753              ,0.982 
3    ,0          ,256  ,64   ,0         ,4.998         ,5.242              ,0.953 
3    ,0          ,256  ,64   ,23        ,4.987         ,5.224              ,0.955 
3    ,0          ,256  ,96   ,0         ,5.846         ,5.735              ,1.019 
3    ,0          ,256  ,96   ,23        ,5.809         ,5.795              ,1.003 
3    ,0          ,4    ,3    ,0         ,3.619         ,3.823              ,0.947 
3    ,0          ,4    ,3    ,23        ,3.644         ,3.798              ,0.96  
3    ,0          ,512  ,96   ,0         ,5.684         ,5.685              ,1.0   
3    ,0          ,512  ,96   ,23        ,5.781         ,5.718              ,1.011 
30   ,0          ,29   ,30   ,0         ,3.332         ,3.682              ,0.905 
30   ,0          ,29   ,30   ,23        ,3.327         ,3.688              ,0.902 
30   ,0          ,31   ,30   ,0         ,3.403         ,3.732              ,0.912 
30   ,0          ,31   ,30   ,23        ,3.406         ,3.778              ,0.902 
31   ,0          ,30   ,31   ,0         ,3.358         ,3.665              ,0.916 
31   ,0          ,30   ,31   ,23        ,3.334         ,3.663              ,0.91  
31   ,0          ,32   ,31   ,0         ,3.381         ,3.712              ,0.911 
31   ,0          ,32   ,31   ,23        ,3.506         ,3.837              ,0.914 
4    ,0          ,192  ,128  ,0         ,6.737         ,6.179              ,1.09  
4    ,0          ,192  ,128  ,23        ,6.341         ,6.195              ,1.024 
4    ,0          ,256  ,128  ,0         ,6.751         ,6.094              ,1.108 
4    ,0          ,256  ,128  ,23        ,6.153         ,6.145              ,1.001 
4    ,0          ,256  ,64   ,0         ,5.052         ,5.33               ,0.948 
4    ,0          ,256  ,64   ,23        ,5.043         ,5.31               ,0.95  
4    ,0          ,3    ,4    ,0         ,3.515         ,3.542              ,0.992 
4    ,0          ,3    ,4    ,23        ,3.508         ,3.531              ,0.993 
4    ,0          ,5    ,4    ,0         ,3.548         ,3.767              ,0.942 
4    ,0          ,5    ,4    ,23        ,3.543         ,3.752              ,0.944 
4    ,0          ,512  ,128  ,0         ,6.143         ,6.093              ,1.008 
4    ,0          ,512  ,128  ,23        ,6.715         ,6.042              ,1.111 
4081 ,0          ,0    ,1    ,0         ,3.262         ,3.912              ,0.834 
4081 ,0          ,0    ,1    ,23        ,3.27          ,3.921              ,0.834 
4081 ,0          ,1    ,2    ,0         ,5.01          ,5.101              ,0.982 
4081 ,0          ,1    ,2    ,23        ,5.01          ,5.061              ,0.99  
4081 ,0          ,10   ,11   ,0         ,4.959         ,5.291              ,0.937 
4081 ,0          ,10   ,11   ,23        ,4.966         ,5.312              ,0.935 
4081 ,0          ,10   ,9    ,0         ,4.317         ,5.319              ,0.812 
4081 ,0          ,10   ,9    ,23        ,4.32          ,5.257              ,0.822 
4081 ,0          ,11   ,10   ,0         ,4.314         ,5.287              ,0.816 
4081 ,0          ,11   ,10   ,23        ,4.325         ,5.268              ,0.821 
4081 ,0          ,11   ,12   ,0         ,4.94          ,5.302              ,0.932 
4081 ,0          ,11   ,12   ,23        ,4.96          ,5.291              ,0.937 
4081 ,0          ,12   ,11   ,0         ,4.379         ,5.237              ,0.836 
4081 ,0          ,12   ,11   ,23        ,4.304         ,5.285              ,0.814 
4081 ,0          ,12   ,13   ,0         ,4.971         ,5.321              ,0.934 
4081 ,0          ,12   ,13   ,23        ,4.944         ,5.26               ,0.94  
4081 ,0          ,13   ,12   ,0         ,4.302         ,5.298              ,0.812 
4081 ,0          ,13   ,12   ,23        ,4.296         ,5.238              ,0.82  
4081 ,0          ,13   ,14   ,0         ,4.933         ,5.278              ,0.935 
4081 ,0          ,13   ,14   ,23        ,4.963         ,5.356              ,0.926 
4081 ,0          ,14   ,13   ,0         ,4.292         ,5.262              ,0.816 
4081 ,0          ,14   ,13   ,23        ,4.337         ,5.342              ,0.812 
4081 ,0          ,14   ,15   ,0         ,4.899         ,5.254              ,0.932 
4081 ,0          ,14   ,15   ,23        ,4.955         ,5.272              ,0.94  
4081 ,0          ,15   ,14   ,0         ,4.327         ,5.284              ,0.819 
4081 ,0          ,15   ,14   ,23        ,4.327         ,5.382              ,0.804 
4081 ,0          ,15   ,16   ,0         ,4.939         ,5.28               ,0.935 
4081 ,0          ,15   ,16   ,23        ,4.986         ,5.275              ,0.945 
4081 ,0          ,16   ,15   ,0         ,5.696         ,7.264              ,0.784 
4081 ,0          ,16   ,15   ,23        ,5.642         ,7.302              ,0.773 
4081 ,0          ,16   ,17   ,0         ,5.603         ,7.975              ,0.703 
4081 ,0          ,16   ,17   ,23        ,5.635         ,7.971              ,0.707 
4081 ,0          ,17   ,16   ,0         ,5.659         ,7.294              ,0.776 
4081 ,0          ,17   ,16   ,23        ,5.716         ,7.371              ,0.775 
4081 ,0          ,17   ,18   ,0         ,5.602         ,7.928              ,0.707 
4081 ,0          ,17   ,18   ,23        ,5.65          ,7.964              ,0.709 
4081 ,0          ,18   ,17   ,0         ,5.697         ,7.34               ,0.776 
4081 ,0          ,18   ,17   ,23        ,5.647         ,7.265              ,0.777 
4081 ,0          ,18   ,19   ,0         ,5.587         ,7.918              ,0.706 
4081 ,0          ,18   ,19   ,23        ,5.625         ,8.091              ,0.695 
4081 ,0          ,19   ,18   ,0         ,5.645         ,7.312              ,0.772 
4081 ,0          ,19   ,18   ,23        ,5.711         ,7.357              ,0.776 
4081 ,0          ,19   ,20   ,0         ,5.572         ,7.979              ,0.698 
4081 ,0          ,19   ,20   ,23        ,5.649         ,7.944              ,0.711 
4081 ,0          ,2    ,1    ,0         ,4.2           ,5.012              ,0.838 
4081 ,0          ,2    ,1    ,23        ,3.979         ,4.597              ,0.865 
4081 ,0          ,2    ,3    ,0         ,5.245         ,5.274              ,0.994 
4081 ,0          ,2    ,3    ,23        ,5.27          ,5.303              ,0.994 
4081 ,0          ,20   ,19   ,0         ,5.646         ,7.264              ,0.777 
4081 ,0          ,20   ,19   ,23        ,5.649         ,7.373              ,0.766 
4081 ,0          ,20   ,21   ,0         ,5.583         ,7.914              ,0.705 
4081 ,0          ,20   ,21   ,23        ,5.614         ,7.952              ,0.706 
4081 ,0          ,21   ,20   ,0         ,5.64          ,7.308              ,0.772 
4081 ,0          ,21   ,20   ,23        ,5.657         ,7.283              ,0.777 
4081 ,0          ,21   ,22   ,0         ,5.592         ,7.854              ,0.712 
4081 ,0          ,21   ,22   ,23        ,5.592         ,7.881              ,0.71  
4081 ,0          ,22   ,21   ,0         ,5.653         ,7.219              ,0.783 
4081 ,0          ,22   ,21   ,23        ,5.628         ,7.21               ,0.781 
4081 ,0          ,22   ,23   ,0         ,5.633         ,7.904              ,0.713 
4081 ,0          ,22   ,23   ,23        ,5.634         ,7.902              ,0.713 
4081 ,0          ,23   ,22   ,0         ,5.658         ,7.27               ,0.778 
4081 ,0          ,23   ,22   ,23        ,5.653         ,7.243              ,0.78  
4081 ,0          ,23   ,24   ,0         ,5.546         ,7.838              ,0.708 
4081 ,0          ,23   ,24   ,23        ,5.574         ,7.876              ,0.708 
4081 ,0          ,24   ,23   ,0         ,5.641         ,7.303              ,0.772 
4081 ,0          ,24   ,23   ,23        ,5.645         ,7.225              ,0.781 
4081 ,0          ,24   ,25   ,0         ,5.566         ,7.864              ,0.708 
4081 ,0          ,24   ,25   ,23        ,5.555         ,7.879              ,0.705 
4081 ,0          ,25   ,24   ,0         ,5.603         ,7.182              ,0.78  
4081 ,0          ,25   ,24   ,23        ,5.604         ,7.186              ,0.78  
4081 ,0          ,25   ,26   ,0         ,5.498         ,7.79               ,0.706 
4081 ,0          ,25   ,26   ,23        ,5.503         ,7.781              ,0.707 
4081 ,0          ,256  ,128  ,23        ,6.564         ,7.033              ,0.933 
4081 ,0          ,256  ,160  ,23        ,8.062         ,8.228              ,0.98  
4081 ,0          ,256  ,192  ,23        ,8.183         ,8.162              ,1.003 
4081 ,0          ,256  ,224  ,23        ,9.406         ,9.034              ,1.041 
4081 ,0          ,256  ,32   ,23        ,5.45          ,6.315              ,0.863 
4081 ,0          ,256  ,64   ,0         ,5.398         ,5.967              ,0.905 
4081 ,0          ,256  ,64   ,23        ,5.557         ,6.259              ,0.888 
4081 ,0          ,256  ,96   ,23        ,6.277         ,6.661              ,0.942 
4081 ,0          ,26   ,25   ,0         ,5.616         ,7.212              ,0.779 
4081 ,0          ,26   ,25   ,23        ,5.586         ,7.134              ,0.783 
4081 ,0          ,26   ,27   ,0         ,5.467         ,7.724              ,0.708 
4081 ,0          ,26   ,27   ,23        ,5.453         ,7.743              ,0.704 
4081 ,0          ,27   ,26   ,0         ,5.56          ,7.131              ,0.78  
4081 ,0          ,27   ,26   ,23        ,5.559         ,7.112              ,0.782 
4081 ,0          ,27   ,28   ,0         ,5.459         ,7.804              ,0.699 
4081 ,0          ,27   ,28   ,23        ,5.454         ,7.837              ,0.696 
4081 ,0          ,28   ,27   ,0         ,5.599         ,7.209              ,0.777 
4081 ,0          ,28   ,27   ,23        ,5.531         ,7.126              ,0.776 
4081 ,0          ,28   ,29   ,0         ,5.458         ,7.795              ,0.7   
4081 ,0          ,28   ,29   ,23        ,5.467         ,7.69               ,0.711 
4081 ,0          ,29   ,28   ,0         ,5.563         ,7.19               ,0.774 
4081 ,0          ,29   ,28   ,23        ,5.536         ,7.119              ,0.778 
4081 ,0          ,29   ,30   ,0         ,5.464         ,7.727              ,0.707 
4081 ,0          ,29   ,30   ,23        ,5.507         ,7.707              ,0.715 
4081 ,0          ,3    ,2    ,0         ,4.347         ,5.331              ,0.815 
4081 ,0          ,3    ,2    ,23        ,4.366         ,5.319              ,0.821 
4081 ,0          ,3    ,4    ,0         ,4.968         ,5.147              ,0.965 
4081 ,0          ,3    ,4    ,23        ,4.972         ,5.04               ,0.987 
4081 ,0          ,30   ,29   ,0         ,5.589         ,7.146              ,0.782 
4081 ,0          ,30   ,29   ,23        ,5.561         ,7.145              ,0.778 
4081 ,0          ,30   ,31   ,0         ,5.453         ,7.709              ,0.707 
4081 ,0          ,30   ,31   ,23        ,5.441         ,7.687              ,0.708 
4081 ,0          ,31   ,30   ,0         ,5.498         ,7.059              ,0.779 
4081 ,0          ,31   ,30   ,23        ,5.52          ,7.076              ,0.78  
4081 ,0          ,32   ,31   ,0         ,5.496         ,7.072              ,0.777 
4081 ,0          ,32   ,31   ,23        ,5.506         ,7.113              ,0.774 
4081 ,0          ,4    ,3    ,0         ,4.341         ,5.298              ,0.819 
4081 ,0          ,4    ,3    ,23        ,4.333         ,5.34               ,0.811 
4081 ,0          ,4    ,5    ,0         ,4.968         ,5.179              ,0.959 
4081 ,0          ,4    ,5    ,23        ,4.984         ,5.108              ,0.976 
4081 ,0          ,5    ,4    ,0         ,4.327         ,5.31               ,0.815 
4081 ,0          ,5    ,4    ,23        ,4.345         ,5.274              ,0.824 
4081 ,0          ,5    ,6    ,0         ,4.907         ,5.312              ,0.924 
4081 ,0          ,5    ,6    ,23        ,4.935         ,5.239              ,0.942 
4081 ,0          ,6    ,5    ,0         ,4.335         ,5.322              ,0.815 
4081 ,0          ,6    ,5    ,23        ,4.337         ,5.272              ,0.823 
4081 ,0          ,6    ,7    ,0         ,4.929         ,5.278              ,0.934 
4081 ,0          ,6    ,7    ,23        ,4.956         ,5.192              ,0.954 
4081 ,0          ,7    ,6    ,0         ,4.307         ,5.273              ,0.817 
4081 ,0          ,7    ,6    ,23        ,4.263         ,5.198              ,0.82  
4081 ,0          ,7    ,8    ,0         ,4.941         ,5.263              ,0.939 
4081 ,0          ,7    ,8    ,23        ,4.975         ,5.301              ,0.939 
4081 ,0          ,8    ,7    ,0         ,4.315         ,5.236              ,0.824 
4081 ,0          ,8    ,7    ,23        ,4.312         ,5.331              ,0.809 
4081 ,0          ,8    ,9    ,0         ,4.97          ,5.327              ,0.933 
4081 ,0          ,8    ,9    ,23        ,4.953         ,5.266              ,0.941 
4081 ,0          ,9    ,10   ,0         ,4.941         ,5.297              ,0.933 
4081 ,0          ,9    ,10   ,23        ,4.959         ,5.303              ,0.935 
4081 ,0          ,9    ,8    ,0         ,4.314         ,5.283              ,0.817 
4081 ,0          ,9    ,8    ,23        ,4.331         ,5.283              ,0.82  
5    ,0          ,192  ,160  ,0         ,7.739         ,7.265              ,1.065 
5    ,0          ,192  ,160  ,23        ,7.878         ,7.41               ,1.063 
5    ,0          ,256  ,160  ,0         ,7.5           ,7.28               ,1.03  
5    ,0          ,256  ,160  ,23        ,7.693         ,7.228              ,1.064 
5    ,0          ,256  ,64   ,0         ,5.195         ,5.353              ,0.97  
5    ,0          ,256  ,64   ,23        ,5.142         ,5.359              ,0.96  
5    ,0          ,4    ,5    ,0         ,3.508         ,3.534              ,0.993 
5    ,0          ,4    ,5    ,23        ,3.506         ,3.532              ,0.993 
5    ,0          ,512  ,160  ,0         ,9.026         ,9.23               ,0.978 
5    ,0          ,512  ,160  ,23        ,9.133         ,9.441              ,0.967 
5    ,0          ,6    ,5    ,0         ,3.575         ,3.729              ,0.959 
5    ,0          ,6    ,5    ,23        ,3.556         ,3.791              ,0.938 
6    ,0          ,192  ,192  ,0         ,7.969         ,7.958              ,1.001 
6    ,0          ,192  ,192  ,23        ,8.081         ,7.991              ,1.011 
6    ,0          ,256  ,192  ,0         ,7.801         ,7.655              ,1.019 
6    ,0          ,256  ,192  ,23        ,7.927         ,7.813              ,1.015 
6    ,0          ,256  ,64   ,0         ,5.218         ,5.435              ,0.96  
6    ,0          ,256  ,64   ,23        ,5.112         ,5.372              ,0.952 
6    ,0          ,5    ,6    ,0         ,3.491         ,3.684              ,0.948 
6    ,0          ,5    ,6    ,23        ,3.483         ,3.718              ,0.937 
6    ,0          ,512  ,192  ,0         ,9.568         ,9.86               ,0.97  
6    ,0          ,512  ,192  ,23        ,9.556         ,9.693              ,0.986 
6    ,0          ,7    ,6    ,0         ,3.631         ,3.739              ,0.971 
6    ,0          ,7    ,6    ,23        ,3.614         ,3.865              ,0.935 
7    ,0          ,192  ,224  ,0         ,7.997         ,7.814              ,1.023 
7    ,0          ,192  ,224  ,23        ,7.919         ,7.82               ,1.013 
7    ,0          ,256  ,224  ,0         ,8.76          ,8.428              ,1.039 
7    ,0          ,256  ,224  ,23        ,8.73          ,8.474              ,1.03  
7    ,0          ,256  ,64   ,0         ,5.074         ,5.389              ,0.942 
7    ,0          ,256  ,64   ,23        ,5.123         ,5.229              ,0.98  
7    ,0          ,512  ,224  ,0         ,9.416         ,9.45               ,0.996 
7    ,0          ,512  ,224  ,23        ,9.405         ,9.482              ,0.992 
7    ,0          ,6    ,7    ,0         ,3.498         ,3.75               ,0.933 
7    ,0          ,6    ,7    ,23        ,3.49          ,3.738              ,0.934 
7    ,0          ,8    ,7    ,0         ,3.631         ,3.773              ,0.962 
7    ,0          ,8    ,7    ,23        ,3.622         ,3.79               ,0.956 
8    ,0          ,7    ,8    ,0         ,3.498         ,3.761              ,0.93  
8    ,0          ,7    ,8    ,23        ,3.489         ,3.785              ,0.922 
8    ,0          ,9    ,8    ,0         ,3.606         ,3.782              ,0.953 
8    ,0          ,9    ,8    ,23        ,3.604         ,3.85               ,0.936 
9    ,0          ,10   ,9    ,0         ,3.589         ,3.84               ,0.935 
9    ,0          ,10   ,9    ,23        ,3.624         ,3.814              ,0.95  
9    ,0          ,8    ,9    ,0         ,3.508         ,3.822              ,0.918 
9    ,0          ,8    ,9    ,23        ,3.5           ,3.793              ,0.923 
0.9281712548418259
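The trailing value above reads as the geometric mean of the last column (new time / old time for each input, so values below 1.0 are improvements).  A minimal C sketch of that reduction, assuming the ratios have already been parsed into an array; the helper name is illustrative and not part of the benchtest harness:

#include <math.h>
#include <stddef.h>

/* Geometric mean of n ratios: exp of the mean of the logs.  */
static double
geomean (const double *ratio, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratio[i]);
  return exp (log_sum / (double) n);
}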

Patch
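Before the diff itself, a note on the short-length path it introduces: the code relies on tzcnt of an all-zero match mask returning the full operand width, so one length compare covers both "no match in the first vector" and "first match past the length bound".  A rough C model of that idea, assuming a 32-byte vector and an in-bounds load (the real code handles page crossing separately); the function name and the fallback to plain memchr are illustrative only:

#include <immintrin.h>
#include <stddef.h>
#include <string.h>

/* Toy model of the first-vector check.  _tzcnt_u32 (0) == 32, so an
   empty match mask fails the length test exactly like an
   out-of-bounds match does.  Build with -mavx2 -mbmi.  */
static const void *
memchr_first_vec_model (const unsigned char *s, int c, size_t len)
{
  __m256i data = _mm256_loadu_si256 ((const __m256i *) s);
  __m256i match = _mm256_set1_epi8 ((char) c);
  unsigned int mask
    = (unsigned int) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, match));
  unsigned int i = _tzcnt_u32 (mask);	/* 32 when mask == 0.  */
  if (len <= i)
    return NULL;	/* No in-bounds match in the first 32 bytes.  */
  if (mask != 0)
    return s + i;	/* In-bounds match.  */
  /* len > 32 with no match yet: the real code continues with the
     aligned 4x-vector loop; fall back to libc memchr here.  */
  return memchr (s + 32, c, len - 32);
}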

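Likewise, the non-RTM loop below folds three vpcmpeqb results into one register with a single vpternlogd whose immediate 254 (0xfe) is the three-way OR.  A minimal intrinsics sketch of that folding, assuming AVX2 plus AVX-512F/VL are available; the helper name is made up for illustration:

#include <immintrin.h>

/* Fold three byte-compare results with one vpternlogd.  For the
   truth-table index built from the bits of (c1, c2, c3), 0xfe is set
   everywhere except the all-zero index, i.e. c1 | c2 | c3.  */
static inline __m256i
or3_byte_matches (__m256i d1, __m256i d2, __m256i d3, __m256i match)
{
  __m256i c1 = _mm256_cmpeq_epi8 (d1, match);
  __m256i c2 = _mm256_cmpeq_epi8 (d2, match);
  __m256i c3 = _mm256_cmpeq_epi8 (d3, match);
  return _mm256_ternarylogic_epi32 (c1, c2, c3, 0xfe);
}

A nonzero byte anywhere in the result can then be turned into a GPR mask with a single vpmovmskb, which is why the loop only needs one branch per four vectors.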
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 0dd4f1dcce..23a1c0018e 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -21,17 +21,27 @@ 
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # ifndef MEMCHR
 #  define MEMCHR	__memchr_evex
 # endif
 
 # ifdef USE_AS_WMEMCHR
+#  define PC_SHIFT_GPR	rcx
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
 #  define VPMINU	vpminud
 #  define VPCMP	vpcmpd
 #  define VPCMPEQ	vpcmpeqd
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
+#  define PC_SHIFT_GPR	rdi
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
 #  define VPMINU	vpminub
 #  define VPCMP	vpcmpb
@@ -39,534 +49,661 @@ 
 #  define CHAR_SIZE	1
 # endif
 
-	/* In the 4x loop the RTM and non-RTM versions have data pointer
-	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
-	   This is represented by BASE_OFFSET. As well because the RTM
-	   version uses vpcmp which stores a bit per element compared where
-	   the non-RTM version uses vpcmpeq which stores a bit per byte
-	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
-	   version.  */
-# ifdef USE_IN_RTM
+# include "reg-macros.h"
+
+
+/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
+   encoding), use VEX encoding in the loop so we can use
+   vpcmpeqb + vptern, which is more efficient than the EVEX
+   alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
 #  define VZEROUPPER
-#  define BASE_OFFSET	(VEC_SIZE * 4)
-#  define RET_SCALE	CHAR_SIZE
+
+#  define USE_TERN_IN_LOOP	0
 # else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
-#  define BASE_OFFSET	0
-#  define RET_SCALE	1
 # endif
 
-	/* In the return from 4x loop memchr and rawmemchr versions have
-	   data pointers off by VEC_SIZE * 4 with memchr version being
-	   VEC_SIZE * 4 greater.  */
-# ifdef USE_AS_RAWMEMCHR
-#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
-#  define RAW_PTR_REG	rcx
-#  define ALGN_PTR_REG	rdi
+# if USE_TERN_IN_LOOP
+	/* Resulting bitmask for vpmovmskb has 4 bits set for each wchar
+	   so we don't want to multiply the resulting index.  */
+#  define TERN_CHAR_MULT	1
+
+#  ifdef USE_AS_WMEMCHR
+#   define TEST_END()	inc %VRCX
+#  else
+#   define TEST_END()	add %rdx, %rcx
+#  endif
 # else
-#  define RET_OFFSET	BASE_OFFSET
-#  define RAW_PTR_REG	rdi
-#  define ALGN_PTR_REG	rcx
+#  define TERN_CHAR_MULT	CHAR_SIZE
+#  define TEST_END()	KORTEST %k2, %k3
 # endif
 
-# define XMMZERO	xmm23
-# define YMMZERO	ymm23
-# define XMMMATCH	xmm16
-# define YMMMATCH	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+#  ifndef USE_AS_WMEMCHR
+#   define GPR_X0_IS_RET	1
+#  else
+#   define GPR_X0_IS_RET	0
+#  endif
+#  define GPR_X0	rax
+# else
+#  define GPR_X0_IS_RET	0
+#  define GPR_X0	rdx
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# ifndef SECTION
-#  define SECTION(p)	p##.evex
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+# else
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+# if CHAR_PER_VEC >= 32
+#  define MASK_GPR(...)	VGPR(__VA_ARGS__)
+# elif CHAR_PER_VEC == 16
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 16)
+# else
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 8)
 # endif
 
-# define VEC_SIZE 32
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define PAGE_SIZE 4096
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
 
-	.section SECTION(.text),"ax",@progbits
+# define PAGE_SIZE	4096
+
+
+	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN (MEMCHR, 6)
-# ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
-	jz	L(zero)
+	jz	L(zero_0)
 
-#  ifdef __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
-#  endif
 # endif
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	VPBROADCAST %esi, %VMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+# ifndef USE_AS_WMEMCHR
+	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
+	   already a dependency between rax and rsi so no worries about
+	   a false dependency here.  */
+	tzcnt	%VRAX, %VRSI
+	/* If rdx <= rsi then either 1) rax was non-zero (there was a
+	   match) but it was out of bounds or 2) rax was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rax is zero then len must be > CHAR_PER_VEC, otherwise
+	   since we already tested len vs tzcnt(rax) (in rsi) we are
+	   good to return this match.  */
+	test	%VRAX, %VRAX
+	jz	L(more_1x_vec)
+	leaq	(%rdi, %rsi), %rax
+# else
 
-	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* If length < CHAR_PER_VEC handle special.  */
+	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
+	   > 1, so tzcnt(0) != CHAR_PER_VEC.  */
 	cmpq	$CHAR_PER_VEC, %rdx
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	ja	L(more_1x_vec)
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(zero_0)
+L(first_vec_x0_ret):
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
 # endif
 	ret
 
-# ifndef USE_AS_RAWMEMCHR
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Check if first match was before length. NB: tzcnt has false data-
-	   dependency on destination. eax already had a data-dependency on esi
-	   so this should have no affect here.  */
-	tzcntl	%eax, %esi
-#  ifdef USE_AS_WMEMCHR
-	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
-#  else
-	addq	%rsi, %rdi
-#  endif
+	/* Only fits in first cache line for VEC_SIZE == 32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
+L(zero_0):
 	xorl	%eax, %eax
-	cmpl	%esi, %edx
-	cmovg	%rdi, %rax
 	ret
 # endif
 
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
-	   for rawmemchr.  */
-	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
-	kmovd	%k0, %r8d
+	.p2align 4,, 9
+L(more_1x_vec):
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	sarl	$2, %eax
-# endif
-# ifndef USE_AS_RAWMEMCHR
-	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
-	subl	%eax, %esi
+	/* For wmemchr we still need to test if there was a match in
+	   the first VEC.  Use bsf to test here so we can reuse
+	   L(first_vec_x0_ret).  */
+	bsf	%VRAX, %VRAX
+	jnz	L(first_vec_x0_ret)
 # endif
+
+L(page_cross_continue):
 # ifdef USE_AS_WMEMCHR
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%eax, %r8d, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+	/* We can't use end of the buffer to re-calculate length for
+	   wmemchr as len * CHAR_SIZE may overflow.  */
+	leaq	-(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	addq	%rdx, %rax
+# else
+	leaq	-(VEC_SIZE + 1)(%rdx, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
+
+	/* rax contains remaining length - 1.  -1 so we can get imm8
+	   encoding in a few additional places saving code size.  */
+
+	/* Needed regardless of remaining length.  */
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+
+	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
+	   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
+	   large length to overflow and cause the subtract to carry
+	   despite length being above CHAR_PER_VEC * 2.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rax
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1_check)
+
+	/* Check the end of data.  NB: use 8-bit operations to save code
+	   size.  We no longer need the full-width of eax and will
+	   perform a write-only operation over eax so there will be no
+	   partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 1 - 1), %al
+	jle	L(zero_0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+	/* For wmemchr we again can't take advantage of tzcnt(0) ==
+	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_0)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+
+	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
+	   fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
+	   not enough space before the next cache line to fit the `lea`
+	   for return.  */
+# if VEC_SIZE == 64
+	ja	L(first_vec_x2_ret)
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 # else
-	addq	%RAW_PTR_REG, %rax
+	jbe	L(zero_0)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
 # endif
+
+	.p2align 4,, 5
+L(first_vec_x1_check):
+	bsf	%VRDX, %VRDX
+	cmpb	%dl, %al
+	jb	L(zero_4)
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 32.
+	 */
+# if VEC_SIZE == 32
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
+
+	.p2align 4,, 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	bsf	%VRCX, %VRCX
+L(first_vec_x2_ret):
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 4
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 5
-L(aligned_more):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Align data to VEC_SIZE.  */
-L(cross_page_continue):
-	xorl	%ecx, %ecx
-	subl	%edi, %ecx
-	andq	$-VEC_SIZE, %rdi
-	/* esi is for adjusting length to see if near the end.  */
-	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %esi
-#  endif
-# else
-	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length. If near end handle specially.  */
-	subq	%rsi, %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	testl	%eax, %eax
+	.p2align 4,, 5
+L(more_2x_vec):
+	/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
+	   length.  */
+
+
+	/* Already computed matches for first VEC in rdx.  */
+	test	%VRDX, %VRDX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Needed regardless of next length check.  */
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if we are near the end.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rax
+	ja	L(more_4x_vec)
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	/* Use 8-bit instructions to save code size.  We won't use full-
+	   width eax again and will perform a write-only operation to
+	   eax so no worries about partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 3), %al
+	jb	L(zero_2)
+L(last_vec_check):
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WMEMCHR
+	/* For wmemchr we again can't take advantage of tzcnt(0) ==
+	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_2)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+	jae	L(first_vec_x4_ret)
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	   For VEC_SIZE == 32 we put the return label at the end of
+	   L(first_vec_x4).  */
+# if VEC_SIZE == 64
+L(first_vec_x4_ret):
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(first_vec_x4):
+	bsf	%VRCX, %VRCX
+# if VEC_SIZE == 32
+	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
+	   cache line as where it is called from so we might as well
+	   save code size by reusing return of L(first_vec_x4).  */
+L(first_vec_x4_ret):
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3_check):
+	/* Need to adjust remaining length before checking.  */
+	addb	$-(CHAR_PER_VEC * 2), %al
+	bsf	%VRCX, %VRCX
+	cmpb	%cl, %al
+	jb	L(zero_2)
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 3
+# if !USE_TERN_IN_LOOP
+	.p2align 4,, 10
+# endif
+L(more_4x_vec):
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x4)
 
+	subq	$-(VEC_SIZE * 5), %rdi
+	subq	$(CHAR_PER_VEC * 8), %rax
+	jb	L(last_4x_vec)
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Check if at last CHAR_PER_VEC * 4 length.  */
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(last_4x_vec_or_less_cmpeq)
-	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-
-	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
-	 */
-#  ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+# else
+	addq	%rdi, %rax
+# endif
+
+
+# if VEC_SIZE == 64
+	/* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
+	   processor has partial register stalls (all have merging
+	   uops).  If that changes this can be removed.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WMEMCHR
 	subl	%edi, %ecx
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
-	addq	%rcx, %rdx
-#  else
-	addq	%rdi, %rdx
-	andq	$-(4 * VEC_SIZE), %rdi
-	subq	%rdi, %rdx
-#  endif
+	addq	%rcx, %rax
 # else
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rax
 # endif
-# ifdef USE_IN_RTM
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-# else
-	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
-	   encodable with EVEX registers (ymm16-ymm31).  */
-	vmovdqa64 %YMMMATCH, %ymm0
+
+
+
+# if USE_TERN_IN_LOOP
+	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
 # endif
 
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
+	.p2align 4,, 11
 L(loop_4x_vec):
-	/* Two versions of the loop. One that does not require
-	   vzeroupper by not using ymm0-ymm15 and another does that require
-	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
-	   is used at all is because there is no EVEX encoding vpcmpeq and
-	   with vpcmpeq this loop can be performed more efficiently. The
-	   non-vzeroupper version is safe for RTM while the vzeroupper
-	   version should be prefered if RTM are not supported.  */
-# ifdef USE_IN_RTM
-	/* It would be possible to save some instructions using 4x VPCMP
-	   but bottleneck on port 5 makes it not woth it.  */
-	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
-	/* xor will set bytes match esi to zero.  */
-	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
-	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
-	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
-	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
-	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
-# else
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is because there is no EVEX encoding
+	   of vpcmpeq and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
 	/* Since vptern can only take 3x vectors fastest to do 1 vec
 	   seperately with EVEX vpcmp.  */
 #  ifdef USE_AS_WMEMCHR
 	/* vptern can only accept masks for epi32/epi64 so can only save
-	   instruction using not equals mask on vptern with wmemchr.  */
-	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+	   instruction using not equals mask on vptern with wmemchr.
+	 */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  else
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  endif
 	/* Compare 3x with vpcmpeq and or them all together with vptern.
 	 */
-	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 #  ifdef USE_AS_WMEMCHR
-	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
-	   combines result from VEC0 with zero mask.  */
-	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
-	vpmovmskb %ymm4, %ecx
+	/* This takes the not of the or of VMM_lo(2), VMM_lo(3), and
+	   VMM_lo(4) as well as combining the result from VEC(0) with
+	   the zero mask.  */
+	vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
+	vpmovmskb %VMM_lo(4), %VRCX
 #  else
-	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
-	vpternlogd $254, %ymm2, %ymm3, %ymm4
-	vpmovmskb %ymm4, %ecx
-	kmovd	%k1, %eax
+	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+	   VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+	KMOV	%k1, %edx
 #  endif
-# endif
 
-# ifdef USE_AS_RAWMEMCHR
-	subq	$-(VEC_SIZE * 4), %rdi
-# endif
-# ifdef USE_IN_RTM
-	kortestd %k2, %k3
 # else
-#  ifdef USE_AS_WMEMCHR
-	/* ecx contains not of matches. All 1s means no matches. incl will
-	   overflow and set zeroflag if that is the case.  */
-	incl	%ecx
-#  else
-	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
-	   to ecx is not an issue because if eax is non-zero it will be
-	   used for returning the match. If it is zero the add does
-	   nothing.  */
-	addq	%rax, %rcx
-#  endif
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
 # endif
-# ifdef USE_AS_RAWMEMCHR
-	jz	L(loop_4x_vec)
-# else
-	jnz	L(loop_4x_vec_end)
+
+
+	TEST_END ()
+	jnz	L(loop_vec_ret)
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop_4x_vec)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jae	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
+	/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
 	 */
-	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
-	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
-	kmovd	%k0, %eax
-	VZEROUPPER
-
-L(last_4x_vec_or_less):
-	/* Check if first VEC contained match.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	COND_VZEROUPPER
 
-	/* If remaining length > CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jg	L(last_4x_vec)
-
-L(last_2x_vec):
-	/* If remaining length < CHAR_PER_VEC.  */
-	addl	$CHAR_PER_VEC, %edx
-	jle	L(zero_end)
-
-	/* Check VEC2 and compare any match with remaining length.  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end):
-	ret
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
+	   instructions on eax from here on out.  */
+# if CHAR_PER_VEC != 64
+	andl	$(CHAR_PER_VEC * 4 - 1), %eax
+# endif
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k0
+	subq	$(VEC_SIZE * 1), %rdi
+	KMOV	%k0, %VRDX
+	cmpb	$(CHAR_PER_VEC * 2 - 1), %al
+	jbe	L(last_2x_vec)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	subb	$(CHAR_PER_VEC * 3), %al
+	jae	L(last_vec_check)
 
-L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(first_vec_x1_check):
-	/* eax must be non-zero. Use bsfl to save code size.  */
-	bsfl	%eax, %eax
-	/* Adjust length.  */
-	subl	$-(CHAR_PER_VEC * 4), %edx
-	/* Check if match within remaining length.  */
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+	addq	$VEC_SIZE, %rdi
+L(last_vec_x1_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
+# endif
 
-	.p2align 4
-L(loop_4x_vec_end):
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
+	   64 it needs a separate return label.  */
+	.p2align 4,, 4
+L(last_vec_x2):
+L(last_vec_x2_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
+	ret
 # endif
-	/* rawmemchr will fall through into this if match was found in
-	   loop.  */
 
-# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
-	/* k1 has not of matches with VEC1.  */
-	kmovd	%k1, %eax
-#  ifdef USE_AS_WMEMCHR
-	subl	$((1 << CHAR_PER_VEC) - 1), %eax
-#  else
-	incl	%eax
-#  endif
+	.p2align 4,, 4
+L(loop_vec_ret):
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+	KMOV	%k1, %VRAX
+	inc	%MASK_GPR(rax)
 # else
-	/* eax already has matches for VEC1.  */
-	testl	%eax, %eax
+	test	%VRDX, %VRDX
 # endif
-	jnz	L(last_vec_x1_return)
+	jnz	L(last_vec_x0)
 
-# ifdef USE_IN_RTM
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %eax
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRDX
 # else
-	vpmovmskb %ymm2, %eax
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRDX
 # endif
-	testl	%eax, %eax
-	jnz	L(last_vec_x2_return)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
 
-# ifdef USE_IN_RTM
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_return)
 
-	kmovd	%k3, %eax
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRDX
 # else
-	vpmovmskb %ymm3, %eax
-	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
-	salq	$VEC_SIZE, %rcx
-	orq	%rcx, %rax
-	tzcntq	%rax, %rax
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
-	VZEROUPPER
+	KMOV	%k2, %VRDX
 # endif
-	ret
 
-	.p2align 4,, 10
-L(last_vec_x1_return):
-	tzcntl	%eax, %eax
-# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
+	   CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
+	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
+	   VEC in a single GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k3, %VRDX
 # else
-	addq	%rdi, %rax
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$(VEC_SIZE / TERN_CHAR_MULT), %rcx
+	addq	%rcx, %rdx
+#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+#  endif
 # endif
-	VZEROUPPER
+	bsf	%rdx, %rdx
+	leaq	(LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
-	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
-	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
-	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
-	VZEROUPPER
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x1_novzero):
+# endif
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-# ifdef USE_IN_RTM
-	.p2align 4
-L(last_vec_x3_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+
+	.p2align 4,, 4
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VGPR(GPR_X0), %VGPR(GPR_X0)
+# if GPR_X0_IS_RET
+	addq	%rdi, %rax
+# else
+	leaq	(%rdi, %GPR_X0, CHAR_SIZE), %rax
+# endif
 	ret
+
+	.p2align 4,, 6
+L(page_cross):
+	/* Need to preserve eax to compute the in-bounds bytes we are
+	   checking.  */
+# ifdef USE_AS_WMEMCHR
+	movl	%eax, %ecx
+# else
+	xorl	%ecx, %ecx
+	subl	%eax, %ecx
 # endif
 
-# ifndef USE_AS_RAWMEMCHR
-	.p2align 4,, 5
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Check first VEC regardless.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
 
-	/* If remaining length <= CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jle	L(last_2x_vec)
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide by CHAR_SIZE to shift out the out-of-bounds bytes.  */
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+# endif
 
-	.p2align 4
-L(last_4x_vec):
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
 
+	shrx	%VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Create mask for possible matches within remaining length.  */
-#  ifdef USE_AS_WMEMCHR
-	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
-	bzhil	%edx, %ecx, %ecx
-#  else
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-#  endif
-	/* Test matches in data against length match.  */
-	andl	%ecx, %eax
-	jnz	L(last_vec_x3)
+# ifdef USE_AS_WMEMCHR
+	negl	%ecx
+# endif
 
-	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
-	   remaining length was found to be > CHAR_PER_VEC * 2.  */
-	subl	$CHAR_PER_VEC, %edx
-	jbe	L(zero_end2)
+	/* Mask the low bits of ecx (the negated offset) to get the
+	   number of CHARs until the next VEC boundary.  */
+	andl	$(CHAR_PER_VEC - 1), %ecx
 
+	/* Check if the search is entirely contained in the remainder
+	   of the page (i.e. length <= CHARs until the page end).  */
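+	/* E.g. for the byte variant with VEC_SIZE == 32 and a page
+	   offset of 0xff2 there are 14 in-bounds bytes left, so any
+	   length <= 14 is handled entirely at L(page_cross_ret).  */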
+	cmpq	%rcx, %rdx
+	jbe	L(page_cross_ret)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Shift remaining length mask for last VEC.  */
-#  ifdef USE_AS_WMEMCHR
-	shrl	$CHAR_PER_VEC, %ecx
-#  else
-	shrq	$CHAR_PER_VEC, %rcx
-#  endif
-	andl	%ecx, %eax
-	jz	L(zero_end2)
-	bsfl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end2):
-	ret
+	/* The length crosses the page, so if rax is zero (no matches)
+	   continue.  */
+	test	%VRAX, %VRAX
+	jz	L(page_cross_continue)
 
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	/* If rdx > rcx then any match here must be in [buf:buf + len].
+	 */
+	tzcnt	%VRAX, %VRAX
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 2
+L(page_cross_zero):
+	xorl	%eax, %eax
 	ret
+
+	.p2align 4,, 4
+L(page_cross_ret):
+	/* Search is entirely contained in page cross case.  */
+# ifdef USE_AS_WMEMCHR
+	test	%VRAX, %VRAX
+	jz	L(page_cross_zero)
+# endif
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(page_cross_zero)
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
-	/* 7 bytes from next cache line.  */
+	ret
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
index deda1ca395..2073eaa620 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -1,3 +1,6 @@ 
-#define MEMCHR __rawmemchr_evex_rtm
-#define USE_AS_RAWMEMCHR 1
-#include "memchr-evex-rtm.S"
+#define RAWMEMCHR	__rawmemchr_evex_rtm
+
+#define USE_IN_RTM	1
+#define SECTION(p)	p##.evex.rtm
+
+#include "rawmemchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
index dc1c450699..dad54def2b 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -1,7 +1,308 @@ 
-#ifndef RAWMEMCHR
-# define RAWMEMCHR	__rawmemchr_evex
-#endif
-#define USE_AS_RAWMEMCHR	1
-#define MEMCHR	RAWMEMCHR
+/* rawmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef RAWMEMCHR
+#  define RAWMEMCHR	__rawmemchr_evex
+# endif
+
+
+# define PC_SHIFT_GPR	rdi
+# define REG_WIDTH	VEC_SIZE
+# define VPTESTN	vptestnmb
+# define VPBROADCAST	vpbroadcastb
+# define VPMINU	vpminub
+# define VPCMP	vpcmpb
+# define VPCMPEQ	vpcmpeqb
+# define CHAR_SIZE	1
+
+# include "reg-macros.h"
+
+/* If not using RTM and VEC_SIZE != 64 (the VEC_SIZE == 64
+   instructions have no VEX encoding), use VEX encoding in the
+   loop so we can use vpcmpeqb + vptern, which is more efficient
+   than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
+#  define VZEROUPPER
+
+#  define USE_TERN_IN_LOOP	0
+# else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define CHAR_PER_VEC	VEC_SIZE
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else	/* !(CHAR_PER_VEC == 64) */
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif	/* !(CHAR_PER_VEC == 64) */
+
+
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (RAWMEMCHR, 6)
+	VPBROADCAST %esi, %VMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	test	%VRAX, %VRAX
+	jz	L(aligned_more)
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4,, 4
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 4)(%rdi, %rax), %rax
+	ret
 
-#include "memchr-evex.S"
+	/* For VEC_SIZE == 32 we can fit this in the alignment padding
+	   so might as well place it more locally.  For VEC_SIZE == 64
+	   we reuse the return code at the end of the loop's return.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 4
+L(FALLTHROUGH_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(page_cross):
+	/* eax has the low page-offset bits of rdi, so the xor zeroes
+	   them out.  */
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	/* Shift out out-of-bounds matches.  */
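+	/* E.g. with VEC_SIZE == 32 and %rdi ending in 0xff0, %rax is
+	   the page base, the compare covers the page's last 32 bytes,
+	   and the shift count (low bits of %rdi, here 16) drops the
+	   bits for bytes below the start pointer.  */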
+	shrx	%VRDI, %VRAX, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	.p2align 4,, 10
+L(aligned_more):
+L(page_cross_continue):
+	/* Align pointer.  */
+	andq	$(VEC_SIZE * -1), %rdi
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
+
+	subq	$-(VEC_SIZE * 1), %rdi
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
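+	/* Zeroing the low byte of %rdi rounds it down to a 256-byte
+	   boundary, which is exactly VEC_SIZE * 4 here.  */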
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# if USE_TERN_IN_LOOP
+	/* Copy VMATCH to a low ymm so we can use vpcmpeq, which cannot
+	   be encoded with the EVEX-only registers.  NB: this is
+	   VEC_SIZE == 32 only as there is no way to encode vpcmpeq
+	   with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason
+	   ymm0-15 is used at all is that there is no EVEX encoding of
+	   vpcmpeq, and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
+	/* Since vptern can only take 3x vectors it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	/* Compare 3x with vpcmpeq and OR them all together with
+	   vptern.  */
+
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	subq	$(VEC_SIZE * -4), %rdi
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
+
+	/* 254 is the immediate for OR-ing VEC_lo(2), VEC_lo(3), and
+	   VEC_lo(4) into VEC_lo(4).  */
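+	/* (0xfe sets every truth-table entry except the all-zero one,
+	   i.e. computes A | B | C.)  */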
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+
+	KMOV	%k1, %eax
+
+	/* NB: rax has the match mask from the first VEC and rcx has
+	   the matches from VEC 2-4.  If rax is non-zero we will return
+	   that match.  If rax is zero the add won't disturb the bits
+	   in rcx.  */
+	add	%rax, %rcx
+# else
+	/* Loop version that uses EVEX encoding.  */
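+	/* A byte of VMM(3) below ends up zero iff the first VEC
+	   matched there (zeroed through the k1 zero-mask) or the
+	   minimum of the two xor results is zero (a match in VEC 2 or
+	   3).  So k2 covers VEC 1-3 and the KORTEST with k3 (VEC 4)
+	   only sets ZF when no VEC has a match.  */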
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 7)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST %k2, %k3
+# endif
+	jz	L(loop_4x_vec)
+
+# if USE_TERN_IN_LOOP
+	test	%VRAX, %VRAX
+# else
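+	/* k1 is the not-equal mask for the first VEC, so it is
+	   all-ones when there is no match and inc wraps it to zero;
+	   otherwise the carry stops at the first match bit, which the
+	   bsf in L(last_vec_x0) then finds.  */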
+	KMOV	%k1, %VRAX
+	inc	%VRAX
+# endif
+	jnz	L(last_vec_x0)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRAX
+# else
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+# endif
+	test	%VRAX, %VRAX
+	jnz	L(last_vec_x1)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRAX
+# else
+	KMOV	%k2, %VRAX
+# endif
+
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
+	   VEC individually, for VEC_SIZE == 32 we combine them in a
+	   single 64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k3, %VRAX
+L(FALLTHROUGH_RETURN_LBL):
+# else
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$CHAR_PER_VEC, %rcx
+	addq	%rcx, %rax
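+	/* A single 64-bit bsf below finds the first match across the
+	   last 2x VEC, relative to FALLTHROUGH_RETURN_OFFSET(%rdi).  */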
+# endif
+	bsf	%rax, %rax
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(TAIL_RETURN_LBL):
+	bsf	%rax, %rax
+	leaq	(TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+END (RAWMEMCHR)
+#endif