Message ID | 20220922002451.4039857-1-skpgkp2@gmail.com |
---|---|
State | New |
Headers | show |
Series | x86_64: Implement evex512 version of strrchr and wcsrchr | expand |
On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch implements following evex512 version of string functions. > evex512 version takes up to 30% less cycle as compared to evex, > depending on length and alignment. > Please attach benchmark numbers. > - strrchr function using 512 bit vectors. > - wcsrchr function using 512 bit vectors. > > Code size data: > > strrchr-evex.o 833 byte > strrchr-evex512.o 573 byte (-31%) > > wcsrchr-evex.o 836 byte > wcsrchr-evex512.o 581 byte (-31%) > > Placeholder function, not used by any processor at the moment. > --- > sysdeps/x86_64/multiarch/Makefile | 2 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > 5 files changed, 334 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index df4601c294..6a275f1c3d 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -110,6 +110,7 @@ sysdep_routines += \ > strrchr-avx2 \ > strrchr-avx2-rtm \ > strrchr-evex \ > + strrchr-evex512 \ > strrchr-sse2 \ > strspn-sse4 \ > strstr-avx512 \ > @@ -152,6 +153,7 @@ sysdep_routines += \ > wcsrchr-avx2 \ > wcsrchr-avx2-rtm \ > wcsrchr-evex \ > + wcsrchr-evex512 \ > wcsrchr-sse2 \ > wmemchr-avx2 \ > wmemchr-avx2-rtm \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index a71444eccb..26c941023a 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct 
libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strrchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __strrchr_evex512) > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > CPU_FEATURE_USABLE (AVX2), > __strrchr_avx2) > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > && CPU_FEATURE_USABLE (AVX512BW) > && CPU_FEATURE_USABLE (BMI2)), > __wcsrchr_evex) > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > + (CPU_FEATURE_USABLE (AVX512VL) > + && CPU_FEATURE_USABLE (AVX512BW) > + && CPU_FEATURE_USABLE (BMI2)), > + __wcsrchr_evex512) > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > CPU_FEATURE_USABLE (AVX2), > __wcsrchr_avx2) > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > new file mode 100644 > index 0000000000..e937cb193c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > @@ -0,0 +1,307 @@ > +/* Placeholder function, not used by any processor at the moment. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* UNUSED. 
Exists purely as reference implementation. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# include <sysdep.h> > + > +# ifdef USE_AS_WCSRCHR > +# define CHAR_SIZE 4 > +# define VPBROADCAST vpbroadcastd > +# define VPCMP vpcmpd > +# define VPMINU vpminud > +# define VPTESTN vptestnmd > +# else > +# define CHAR_SIZE 1 > +# define VPBROADCAST vpbroadcastb > +# define VPCMP vpcmpb > +# define VPMINU vpminub > +# define VPTESTN vptestnmb > +# endif > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +# if VEC_SIZE == 64 > +# define BLSMSK blsmskq > +# define BSR bsrq > +# define KMOV kmovq > +# define KOR korq > +# define KORTEST kortestq > +# define R8 r8 > +# define RAX rax > +# define RCX rcx > +# define RDX rdx > +# define SHR shrq > +# define TEXTSUFFIX evex512 > +# define VMM0 zmm16 > +# define VMM1 zmm17 > +# define VMM2 zmm18 > +# define VMM3 zmm19 > +# define VMM4 zmm20 > +# define VMM5 zmm21 > +# define VMOVA vmovdqa64 > +# define VMOVU vmovdqu64 > + > +# elif VEC_SIZE == 32 > +/* Currently Unused. */ > +# define BLSMSK blsmskl > +# define BSR bsrl > +# define KMOV kmovd > +# define KOR kord > +# define KORTEST kortestd > +# define R8 r8d > +# define RAX eax > +# define RCX ecx > +# define RDX edx > +# define SHR shrl > +# define TEXTSUFFIX evex256 > +# define VMM0 ymm16 > +# define VMM1 ymm17 > +# define VMM2 ymm18 > +# define VMM3 ymm19 > +# define VMM4 ymm20 > +# define VMM5 ymm21 > +# define VMOVA vmovdqa32 > +# define VMOVU vmovdqu32 > +# endif > + > + .section .text.TEXTSUFFIX, "ax", @progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > +ENTRY_P2ALIGN (STRRCHR, 6) > + > + /* Broadcast CHAR to VMM0. */ > + VPBROADCAST %esi, %VMM0 > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > + VMOVU (%rdi), %VMM1 > + > + VPTESTN %VMM1, %VMM1, %k1 > + KMOV %k1, %RCX > + test %RCX, %RCX > + jz L(align_more) > + > + VPCMP $0, %VMM1, %VMM0, %k0 > + KMOV %k0, %RAX > + BLSMSK %RCX, %RCX > + and %RCX, %RAX > + jz L(ret) > + > + BSR %RAX, %RAX > +# ifdef USE_AS_WCSRCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > +L(ret): > + ret > + > +L(vector_x2_end): > + VPCMP $0, %VMM2, %VMM0, %k2 > + KMOV %k2, %RAX > + BLSMSK %RCX, %RCX > + and %RCX, %RAX > + jz L(vector_x1_ret) > + > + BSR %RAX, %RAX > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + /* Check the first vector at very last to look for match. */ > +L(vector_x1_ret): > + VPCMP $0, %VMM1, %VMM0, %k2 > + KMOV %k2, %RAX > + test %RAX, %RAX > + jz L(ret) > + > + BSR %RAX, %RAX > +# ifdef USE_AS_WCSRCHR > + leaq (%rsi, %rax, CHAR_SIZE), %rax > +# else > + add %rsi, %rax > +# endif > + ret > + > +L(align_more): > + /* Zero r8 to store match result. */ > + xorq %r8, %r8 > + /* Save pointer of first vector, in case if no match found. */ > + movq %rdi, %rsi > + /* Align pointer to vector size. */ > + andq $-VEC_SIZE, %rdi > + /* Loop unroll 2 times for 2 vector loop. */ > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > + VPTESTN %VMM2, %VMM2, %k0 > + KMOV %k0, %RCX > + test %RCX, %RCX > + jnz L(vector_x2_end) > + > + /* Save pointer of second vector, in case if no match > + found. */ > + movq %rdi, %r9 > + /* Align address to VEC_SIZE * 2 for loop. */ > + andq $-(VEC_SIZE * 2), %rdi > + > + .p2align 4,,11 > +L(loop): > + /* 2 vector loop, as it provide better performance as compared > + to 4 vector loop. */ > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > + VPCMP $0, %VMM3, %VMM0, %k1 > + VPCMP $0, %VMM4, %VMM0, %k2 > + VPMINU %VMM3, %VMM4, %VMM5 > + VPTESTN %VMM5, %VMM5, %k0 > + KOR %k1, %k2, %k3 > + subq $-(VEC_SIZE * 2), %rdi > + /* If k0 and k3 zero, match and end of string not found. 
*/ > + KORTEST %k0, %k3 > + jz L(loop) > + > + /* If k0 is non zero, end of string found. */ > + KORTEST %k0, %k0 > + jnz L(endloop) > + > + /* A match found, it need to be stored in r8 before loop > + continue. */ > + /* Check second vector first. */ > + KMOV %k2, %RDX > + test %RDX, %RDX > + jz L(loop_vec_x3_ret) > + > + BSR %RDX, %RDX > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > + jmp L(loop) > + > + /* If second vector doesn't have match, first vector must > + have match. */ > +L(loop_vec_x3_ret): > + KMOV %k1, %R8 > + BSR %R8, %R8 > +# ifdef USE_AS_WCSRCHR > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > +# else > + add %rdi, %r8 > +# endif > + jmp L(loop) > + > +L(endloop): > + /* Check if string end in first loop vector. */ > + VPTESTN %VMM3, %VMM3, %k0 > + KMOV %k0, %RCX > + test %RCX, %RCX > + jnz L(vector_x3_end) > + > + /* Check if it has match in first loop vector. */ > + KMOV %k1, %RAX > + test %RAX, %RAX > + jz L(vector_x4_end) > + > + BSR %RAX, %RAX > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > + > + /* String must end in second loop vector. */ > +L(vector_x4_end): > + VPTESTN %VMM4, %VMM4, %k0 > + KMOV %k0, %RCX > + KMOV %k2, %RAX > + BLSMSK %RCX, %RCX > + /* Check if it has match in second loop vector. */ > + and %RCX, %RAX > + jz L(check_last_match) > + > + BSR %RAX, %RAX > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + /* String end in first loop vector. */ > +L(vector_x3_end): > + KMOV %k1, %RAX > + BLSMSK %RCX, %RCX > + /* Check if it has match in second loop vector. */ > + and %RCX, %RAX > + jz L(check_last_match) > + > + BSR %RAX, %RAX > + leaq (%rdi, %rax, CHAR_SIZE), %rax > + ret > + > + /* No match in first and second loop vector. */ > +L(check_last_match): > + /* Check if any match recorded in r8. */ > + test %r8, %r8 > + jz L(vector_x2_ret) > + movq %r8, %rax > + ret > + > + /* No match recorded in r8. Check the second saved vector > + in begining. 
*/ > +L(vector_x2_ret): > + VPCMP $0, %VMM2, %VMM0, %k2 > + KMOV %k2, %RAX > + test %RAX, %RAX > + jz L(vector_x1_ret) > + > + /* Match found in the second saved vector. */ > + BSR %RAX, %RAX > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > + ret > + > +L(page_cross): > + movl %eax, %ecx > +# ifdef USE_AS_WCSRCHR > + /* Calculate number of compare result bits to be skipped for > + wide string alignment adjustment. */ > + andl $(VEC_SIZE - 1), %ecx > + sarl $2, %ecx > +# endif > + /* ecx contains number of w[char] to be skipped as a result > + of address alignment. */ > + xorq %rdi, %rax > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > + > + VPTESTN %VMM1, %VMM1, %k1 > + KMOV %k1, %RAX > + SHR %cl, %RAX > + jz L(page_cross_continue) > + VPCMP $0, %VMM1, %VMM0, %k0 > + KMOV %k0, %RDX > + SHR %cl, %RDX > + BLSMSK %RAX, %RAX > + and %RDX, %RAX > + jz L(ret) > + BSR %RAX, %RAX > +# ifdef USE_AS_WCSRCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + add %rdi, %rax > +# endif > + > + ret > +END (STRRCHR) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > new file mode 100644 > index 0000000000..f880848e09 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > @@ -0,0 +1,7 @@ > +# ifndef STRRCHR > +# define STRRCHR __strrchr_evex512 > +# endif > + > +#define VEC_SIZE 64 > + > +#include "strrchr-evex-base.S" > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > new file mode 100644 > index 0000000000..65b7710b22 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > @@ -0,0 +1,8 @@ > +#ifndef WCSRCHR > +# define WCSRCHR __wcsrchr_evex512 > +#endif > + > +#define STRRCHR WCSRCHR > +#define USE_AS_WCSRCHR 1 > + > +#include "strrchr-evex512.S" > -- > 2.36.1 >
Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > This patch implements following evex512 version of string functions. > > evex512 version takes up to 30% less cycle as compared to evex, > > depending on length and alignment. > > > > Please attach benchmark numbers. > > > - strrchr function using 512 bit vectors. > > - wcsrchr function using 512 bit vectors. > > > > Code size data: > > > > strrchr-evex.o 833 byte > > strrchr-evex512.o 573 byte (-31%) > > > > wcsrchr-evex.o 836 byte > > wcsrchr-evex512.o 581 byte (-31%) > > > > Placeholder function, not used by any processor at the moment. > > --- > > sysdeps/x86_64/multiarch/Makefile | 2 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > > 5 files changed, 334 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index df4601c294..6a275f1c3d 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -110,6 +110,7 @@ sysdep_routines += \ > > strrchr-avx2 \ > > strrchr-avx2-rtm \ > > strrchr-evex \ > > + strrchr-evex512 \ > > strrchr-sse2 \ > > strspn-sse4 \ > > strstr-avx512 \ > > @@ -152,6 +153,7 @@ sysdep_routines += \ > > wcsrchr-avx2 \ > > wcsrchr-avx2-rtm \ > > wcsrchr-evex \ > > + wcsrchr-evex512 \ > > wcsrchr-sse2 \ > > wmemchr-avx2 \ > > wmemchr-avx2-rtm \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c 
b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index a71444eccb..26c941023a 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > (CPU_FEATURE_USABLE (AVX512VL) > > && CPU_FEATURE_USABLE (AVX512BW)), > > __strrchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __strrchr_evex512) > > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > > CPU_FEATURE_USABLE (AVX2), > > __strrchr_avx2) > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (AVX512BW) > > && CPU_FEATURE_USABLE (BMI2)), > > __wcsrchr_evex) > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > > + (CPU_FEATURE_USABLE (AVX512VL) > > + && CPU_FEATURE_USABLE (AVX512BW) > > + && CPU_FEATURE_USABLE (BMI2)), > > + __wcsrchr_evex512) > > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > > CPU_FEATURE_USABLE (AVX2), > > __wcsrchr_avx2) > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > new file mode 100644 > > index 0000000000..e937cb193c > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > @@ -0,0 +1,307 @@ > > +/* Placeholder function, not used by any processor at the moment. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* UNUSED. Exists purely as reference implementation. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > +# include <sysdep.h> > > + > > +# ifdef USE_AS_WCSRCHR > > +# define CHAR_SIZE 4 > > +# define VPBROADCAST vpbroadcastd > > +# define VPCMP vpcmpd > > +# define VPMINU vpminud > > +# define VPTESTN vptestnmd > > +# else > > +# define CHAR_SIZE 1 > > +# define VPBROADCAST vpbroadcastb > > +# define VPCMP vpcmpb > > +# define VPMINU vpminub > > +# define VPTESTN vptestnmb > > +# endif > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > +# if VEC_SIZE == 64 > > +# define BLSMSK blsmskq > > +# define BSR bsrq > > +# define KMOV kmovq > > +# define KOR korq > > +# define KORTEST kortestq > > +# define R8 r8 > > +# define RAX rax > > +# define RCX rcx > > +# define RDX rdx > > +# define SHR shrq > > +# define TEXTSUFFIX evex512 > > +# define VMM0 zmm16 > > +# define VMM1 zmm17 > > +# define VMM2 zmm18 > > +# define VMM3 zmm19 > > +# define VMM4 zmm20 > > +# define VMM5 zmm21 > > +# define VMOVA vmovdqa64 > > +# define VMOVU vmovdqu64 > > + > > +# elif VEC_SIZE == 32 > > +/* Currently Unused. 
*/ > > +# define BLSMSK blsmskl > > +# define BSR bsrl > > +# define KMOV kmovd > > +# define KOR kord > > +# define KORTEST kortestd > > +# define R8 r8d > > +# define RAX eax > > +# define RCX ecx > > +# define RDX edx > > +# define SHR shrl > > +# define TEXTSUFFIX evex256 > > +# define VMM0 ymm16 > > +# define VMM1 ymm17 > > +# define VMM2 ymm18 > > +# define VMM3 ymm19 > > +# define VMM4 ymm20 > > +# define VMM5 ymm21 > > +# define VMOVA vmovdqa32 > > +# define VMOVU vmovdqu32 > > +# endif > > + > > + .section .text.TEXTSUFFIX, "ax", @progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > +ENTRY_P2ALIGN (STRRCHR, 6) > > + > > + /* Broadcast CHAR to VMM0. */ > > + VPBROADCAST %esi, %VMM0 > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(page_cross) > > + > > +L(page_cross_continue): > > + /* Compare [w]char for null, mask bit will be set for match. */ > > + VMOVU (%rdi), %VMM1 > > + > > + VPTESTN %VMM1, %VMM1, %k1 > > + KMOV %k1, %RCX > > + test %RCX, %RCX > > + jz L(align_more) > > + > > + VPCMP $0, %VMM1, %VMM0, %k0 > > + KMOV %k0, %RAX > > + BLSMSK %RCX, %RCX > > + and %RCX, %RAX > > + jz L(ret) > > + > > + BSR %RAX, %RAX > > +# ifdef USE_AS_WCSRCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > +L(ret): > > + ret > > + > > +L(vector_x2_end): > > + VPCMP $0, %VMM2, %VMM0, %k2 > > + KMOV %k2, %RAX > > + BLSMSK %RCX, %RCX > > + and %RCX, %RAX > > + jz L(vector_x1_ret) > > + > > + BSR %RAX, %RAX > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + /* Check the first vector at very last to look for match. 
*/ > > +L(vector_x1_ret): > > + VPCMP $0, %VMM1, %VMM0, %k2 > > + KMOV %k2, %RAX > > + test %RAX, %RAX > > + jz L(ret) > > + > > + BSR %RAX, %RAX > > +# ifdef USE_AS_WCSRCHR > > + leaq (%rsi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rsi, %rax > > +# endif > > + ret > > + > > +L(align_more): > > + /* Zero r8 to store match result. */ > > + xorq %r8, %r8 > > + /* Save pointer of first vector, in case if no match found. */ > > + movq %rdi, %rsi > > + /* Align pointer to vector size. */ > > + andq $-VEC_SIZE, %rdi > > + /* Loop unroll 2 times for 2 vector loop. */ > > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > > + VPTESTN %VMM2, %VMM2, %k0 > > + KMOV %k0, %RCX > > + test %RCX, %RCX > > + jnz L(vector_x2_end) > > + > > + /* Save pointer of second vector, in case if no match > > + found. */ > > + movq %rdi, %r9 > > + /* Align address to VEC_SIZE * 2 for loop. */ > > + andq $-(VEC_SIZE * 2), %rdi > > + > > + .p2align 4,,11 > > +L(loop): > > + /* 2 vector loop, as it provide better performance as compared > > + to 4 vector loop. */ > > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > > + VPCMP $0, %VMM3, %VMM0, %k1 > > + VPCMP $0, %VMM4, %VMM0, %k2 > > + VPMINU %VMM3, %VMM4, %VMM5 > > + VPTESTN %VMM5, %VMM5, %k0 > > + KOR %k1, %k2, %k3 > > + subq $-(VEC_SIZE * 2), %rdi > > + /* If k0 and k3 zero, match and end of string not found. */ > > + KORTEST %k0, %k3 > > + jz L(loop) > > + > > + /* If k0 is non zero, end of string found. */ > > + KORTEST %k0, %k0 > > + jnz L(endloop) > > + > > + /* A match found, it need to be stored in r8 before loop > > + continue. */ > > + /* Check second vector first. */ > > + KMOV %k2, %RDX > > + test %RDX, %RDX > > + jz L(loop_vec_x3_ret) > > + > > + BSR %RDX, %RDX > > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > > + jmp L(loop) > > + > > + /* If second vector doesn't have match, first vector must > > + have match. 
*/ > > +L(loop_vec_x3_ret): > > + KMOV %k1, %R8 > > + BSR %R8, %R8 > > +# ifdef USE_AS_WCSRCHR > > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > > +# else > > + add %rdi, %r8 > > +# endif > > + jmp L(loop) > > + > > +L(endloop): > > + /* Check if string end in first loop vector. */ > > + VPTESTN %VMM3, %VMM3, %k0 > > + KMOV %k0, %RCX > > + test %RCX, %RCX > > + jnz L(vector_x3_end) > > + > > + /* Check if it has match in first loop vector. */ > > + KMOV %k1, %RAX > > + test %RAX, %RAX > > + jz L(vector_x4_end) > > + > > + BSR %RAX, %RAX > > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > > + > > + /* String must end in second loop vector. */ > > +L(vector_x4_end): > > + VPTESTN %VMM4, %VMM4, %k0 > > + KMOV %k0, %RCX > > + KMOV %k2, %RAX > > + BLSMSK %RCX, %RCX > > + /* Check if it has match in second loop vector. */ > > + and %RCX, %RAX > > + jz L(check_last_match) > > + > > + BSR %RAX, %RAX > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + /* String end in first loop vector. */ > > +L(vector_x3_end): > > + KMOV %k1, %RAX > > + BLSMSK %RCX, %RCX > > + /* Check if it has match in second loop vector. */ > > + and %RCX, %RAX > > + jz L(check_last_match) > > + > > + BSR %RAX, %RAX > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > + ret > > + > > + /* No match in first and second loop vector. */ > > +L(check_last_match): > > + /* Check if any match recorded in r8. */ > > + test %r8, %r8 > > + jz L(vector_x2_ret) > > + movq %r8, %rax > > + ret > > + > > + /* No match recorded in r8. Check the second saved vector > > + in begining. */ > > +L(vector_x2_ret): > > + VPCMP $0, %VMM2, %VMM0, %k2 > > + KMOV %k2, %RAX > > + test %RAX, %RAX > > + jz L(vector_x1_ret) > > + > > + /* Match found in the second saved vector. 
*/ > > + BSR %RAX, %RAX > > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > > + ret > > + > > +L(page_cross): > > + movl %eax, %ecx > > +# ifdef USE_AS_WCSRCHR > > + /* Calculate number of compare result bits to be skipped for > > + wide string alignment adjustment. */ > > + andl $(VEC_SIZE - 1), %ecx > > + sarl $2, %ecx > > +# endif > > + /* ecx contains number of w[char] to be skipped as a result > > + of address alignment. */ > > + xorq %rdi, %rax > > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > > + > > + VPTESTN %VMM1, %VMM1, %k1 > > + KMOV %k1, %RAX > > + SHR %cl, %RAX > > + jz L(page_cross_continue) > > + VPCMP $0, %VMM1, %VMM0, %k0 > > + KMOV %k0, %RDX > > + SHR %cl, %RDX > > + BLSMSK %RAX, %RAX > > + and %RDX, %RAX > > + jz L(ret) > > + BSR %RAX, %RAX > > +# ifdef USE_AS_WCSRCHR > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > +# else > > + add %rdi, %rax > > +# endif > > + > > + ret > > +END (STRRCHR) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > new file mode 100644 > > index 0000000000..f880848e09 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > @@ -0,0 +1,7 @@ > > +# ifndef STRRCHR > > +# define STRRCHR __strrchr_evex512 > > +# endif > > + > > +#define VEC_SIZE 64 > > + > > +#include "strrchr-evex-base.S" > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > new file mode 100644 > > index 0000000000..65b7710b22 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > @@ -0,0 +1,8 @@ > > +#ifndef WCSRCHR > > +# define WCSRCHR __wcsrchr_evex512 > > +#endif > > + > > +#define STRRCHR WCSRCHR > > +#define USE_AS_WCSRCHR 1 > > + > > +#include "strrchr-evex512.S" > > -- > > 2.36.1 > > Function: wcsrchr Variant: __wcsrchr_evex __wcsrchr_evex512 ======================================================================================================================== len=256, pos=64, align=1: 
21.29 16.97 ( 20.28%) len=256, pos=64, align=1: 22.62 15.73 ( 30.45%) len=256, pos=64, align=15: 21.59 16.63 ( 22.97%) len=256, pos=64, align=15: 21.23 17.50 ( 17.57%) len=256, pos=64, align=2: 20.73 14.58 ( 29.65%) len=256, pos=64, align=2: 19.78 15.80 ( 20.10%) len=256, pos=64, align=30: 20.09 17.02 ( 15.29%) len=256, pos=64, align=30: 19.06 16.43 ( 13.81%) len=256, pos=64, align=3: 18.44 14.15 ( 23.24%) len=256, pos=64, align=3: 16.87 11.87 ( 29.62%) len=256, pos=64, align=45: 17.03 13.87 ( 18.58%) len=256, pos=64, align=45: 17.36 14.59 ( 15.96%) len=256, pos=64, align=4: 17.64 13.28 ( 24.72%) len=256, pos=64, align=4: 17.13 12.94 ( 24.44%) len=256, pos=64, align=60: 17.21 14.14 ( 17.82%) len=256, pos=64, align=60: 16.67 12.99 ( 22.09%) len=256, pos=64, align=5: 16.32 13.24 ( 18.86%) len=256, pos=64, align=5: 15.09 12.53 ( 16.95%) len=256, pos=64, align=75: 15.11 12.34 ( 18.32%) len=256, pos=64, align=75: 14.67 12.00 ( 18.19%) len=256, pos=64, align=6: 14.67 10.22 ( 30.30%) len=256, pos=64, align=6: 15.11 11.95 ( 20.90%) len=256, pos=64, align=90: 14.85 13.65 ( 8.09%) len=256, pos=64, align=90: 14.50 13.53 ( 6.70%) len=256, pos=64, align=7: 14.53 12.00 ( 17.40%) len=256, pos=64, align=7: 14.53 12.00 ( 17.41%) len=256, pos=64, align=105: 14.67 10.22 ( 30.30%) len=256, pos=64, align=105: 16.23 11.89 ( 26.74%) len=1, pos=0, align=0: 4.67 4.67 ( 0.03%) len=1, pos=0, align=0: 4.67 5.33 (-14.27%) len=2, pos=1, align=0: 4.67 5.28 (-13.12%) len=2, pos=1, align=0: 4.67 5.33 (-14.23%) len=3, pos=2, align=0: 5.33 5.33 ( -0.00%) len=3, pos=2, align=0: 4.67 4.67 ( 0.00%) len=4, pos=3, align=0: 5.33 5.33 ( 0.00%) len=4, pos=3, align=0: 4.67 5.07 ( -8.66%) len=5, pos=4, align=0: 4.67 5.33 (-14.23%) len=5, pos=4, align=0: 4.67 5.33 (-14.23%) len=6, pos=5, align=0: 4.67 4.67 ( 0.04%) len=6, pos=5, align=0: 4.67 5.33 (-14.22%) len=7, pos=6, align=0: 4.67 5.33 (-14.22%) len=7, pos=6, align=0: 4.67 5.33 (-14.23%) len=8, pos=7, align=0: 4.67 4.67 ( 0.01%) len=8, pos=7, align=0: 4.67 
4.67 ( 0.05%) len=9, pos=8, align=0: 8.00 4.67 ( 41.67%) len=9, pos=8, align=0: 8.00 5.33 ( 33.33%) len=10, pos=9, align=0: 8.00 4.67 ( 41.64%) len=10, pos=9, align=0: 7.33 5.33 ( 27.27%) len=11, pos=10, align=0: 8.00 4.67 ( 41.66%) len=11, pos=10, align=0: 8.17 5.33 ( 34.71%) len=12, pos=11, align=0: 8.00 5.33 ( 33.33%) len=12, pos=11, align=0: 8.00 4.67 ( 41.66%) len=13, pos=12, align=0: 8.00 4.67 ( 41.66%) len=13, pos=12, align=0: 8.00 4.67 ( 41.64%) len=14, pos=13, align=0: 8.00 4.67 ( 41.64%) len=14, pos=13, align=0: 8.00 5.33 ( 33.33%) len=15, pos=14, align=0: 8.00 4.67 ( 41.66%) len=15, pos=14, align=0: 8.00 4.67 ( 41.64%) len=16, pos=15, align=0: 7.33 4.67 ( 36.34%) len=16, pos=15, align=0: 8.00 4.67 ( 41.66%) len=17, pos=16, align=0: 6.67 6.01 ( 9.98%) len=17, pos=16, align=0: 6.67 6.00 ( 10.00%) len=18, pos=17, align=0: 6.67 6.00 ( 9.99%) len=18, pos=17, align=0: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 6.00 6.00 ( -0.02%) len=19, pos=18, align=0: 6.67 6.00 ( 10.00%) len=20, pos=19, align=0: 6.67 6.00 ( 10.00%) len=20, pos=19, align=0: 6.67 6.00 ( 10.03%) len=21, pos=20, align=0: 6.67 6.00 ( 10.03%) len=21, pos=20, align=0: 6.67 6.17 ( 7.49%) len=22, pos=21, align=0: 6.67 6.00 ( 10.00%) len=22, pos=21, align=0: 6.67 6.00 ( 10.02%) len=23, pos=22, align=0: 6.67 6.00 ( 10.01%) len=23, pos=22, align=0: 6.67 6.00 ( 10.00%) len=24, pos=23, align=0: 6.67 6.17 ( 7.50%) len=24, pos=23, align=0: 6.67 6.00 ( 10.03%) len=25, pos=24, align=0: 8.00 6.00 ( 25.03%) len=25, pos=24, align=0: 8.67 6.00 ( 30.75%) len=26, pos=25, align=0: 8.67 6.00 ( 30.77%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 24.98%) len=27, pos=26, align=0: 8.00 6.00 ( 24.97%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 24.99%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.67 6.00 ( 30.74%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, 
align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 24.98%) len=32, pos=31, align=0: 8.67 6.00 ( 30.77%) len=32, pos=31, align=0: 8.00 6.00 ( 25.01%) len=256, pos=64, align=1: 35.05 27.10 ( 22.68%) len=256, pos=64, align=1: 35.09 27.05 ( 22.92%) len=256, pos=64, align=15: 33.65 27.42 ( 18.51%) len=256, pos=64, align=15: 34.41 26.49 ( 23.01%) len=256, pos=64, align=2: 35.11 27.10 ( 22.83%) len=256, pos=64, align=2: 34.71 26.18 ( 24.58%) len=256, pos=64, align=30: 35.01 26.04 ( 25.62%) len=256, pos=64, align=30: 34.90 25.79 ( 26.10%) len=256, pos=64, align=3: 33.87 26.38 ( 22.10%) len=256, pos=64, align=3: 33.96 26.76 ( 21.18%) len=256, pos=64, align=45: 34.63 26.45 ( 23.62%) len=256, pos=64, align=45: 33.97 26.06 ( 23.27%) len=256, pos=64, align=4: 34.63 26.34 ( 23.95%) len=256, pos=64, align=4: 34.74 27.33 ( 21.33%) len=256, pos=64, align=60: 33.84 25.74 ( 23.92%) len=256, pos=64, align=60: 35.60 28.81 ( 19.07%) len=256, pos=64, align=5: 35.25 27.10 ( 23.11%) len=256, pos=64, align=5: 34.73 26.43 ( 23.91%) len=256, pos=64, align=75: 33.96 26.59 ( 21.69%) len=256, pos=64, align=75: 35.08 26.48 ( 24.52%) len=256, pos=64, align=6: 36.00 26.15 ( 27.38%) len=256, pos=64, align=6: 34.76 26.63 ( 23.38%) len=256, pos=64, align=90: 34.88 27.15 ( 22.17%) len=256, pos=64, align=90: 34.69 27.05 ( 22.00%) len=256, pos=64, align=7: 33.85 26.29 ( 22.32%) len=256, pos=64, align=7: 35.03 27.08 ( 22.70%) len=256, pos=64, align=105: 35.22 26.11 ( 25.87%) len=256, pos=64, align=105: 34.91 27.23 ( 21.99%) len=1, pos=0, align=0: 5.19 6.47 (-24.54%) len=1, pos=0, align=0: 4.67 5.33 (-14.24%) len=2, pos=1, align=0: 4.83 4.67 ( 3.42%) len=2, pos=1, align=0: 4.67 4.67 ( 0.00%) len=3, pos=2, align=0: 5.33 5.33 ( -0.00%) len=3, pos=2, align=0: 4.67 4.67 ( 0.04%) len=4, pos=3, align=0: 4.67 5.33 (-14.23%) len=4, pos=3, align=0: 4.67 5.33 (-14.24%) len=5, pos=4, align=0: 4.67 5.33 (-14.23%) len=5, pos=4, align=0: 4.67 4.67 ( 0.03%) len=6, pos=5, align=0: 4.67 4.67 ( 0.03%) len=6, 
pos=5, align=0: 4.67 4.67 ( 0.04%) len=7, pos=6, align=0: 4.67 5.33 (-14.23%) len=7, pos=6, align=0: 4.67 5.33 (-14.24%) len=8, pos=7, align=0: 7.34 5.33 ( 27.32%) len=8, pos=7, align=0: 7.33 5.33 ( 27.28%) len=9, pos=8, align=0: 8.00 4.67 ( 41.66%) len=9, pos=8, align=0: 8.00 5.33 ( 33.33%) len=10, pos=9, align=0: 8.00 5.39 ( 32.61%) len=10, pos=9, align=0: 8.00 4.67 ( 41.66%) len=11, pos=10, align=0: 8.16 5.33 ( 34.66%) len=11, pos=10, align=0: 8.00 5.33 ( 33.33%) len=12, pos=11, align=0: 8.00 4.67 ( 41.64%) len=12, pos=11, align=0: 8.00 5.33 ( 33.33%) len=13, pos=12, align=0: 8.00 4.67 ( 41.65%) len=13, pos=12, align=0: 7.33 5.33 ( 27.26%) len=14, pos=13, align=0: 8.00 5.33 ( 33.33%) len=14, pos=13, align=0: 8.00 5.33 ( 33.33%) len=15, pos=14, align=0: 8.00 4.67 ( 41.64%) len=15, pos=14, align=0: 8.00 4.67 ( 41.66%) len=16, pos=15, align=0: 8.67 6.89 ( 20.50%) len=16, pos=15, align=0: 8.67 6.67 ( 23.06%) len=17, pos=16, align=0: 6.85 6.00 ( 12.31%) len=17, pos=16, align=0: 6.00 6.00 ( 0.04%) len=18, pos=17, align=0: 6.00 6.00 ( 0.02%) len=18, pos=17, align=0: 6.67 6.00 ( 10.03%) len=19, pos=18, align=0: 6.67 6.00 ( 9.97%) len=19, pos=18, align=0: 6.67 6.00 ( 9.99%) len=20, pos=19, align=0: 6.67 6.00 ( 10.02%) len=20, pos=19, align=0: 6.67 6.00 ( 10.02%) len=21, pos=20, align=0: 6.35 6.35 ( 0.01%) len=21, pos=20, align=0: 7.06 6.79 ( 3.87%) len=22, pos=21, align=0: 6.67 6.17 ( 7.56%) len=22, pos=21, align=0: 6.67 6.00 ( 10.02%) len=23, pos=22, align=0: 6.67 6.00 ( 10.03%) len=23, pos=22, align=0: 6.00 6.00 ( 0.02%) len=24, pos=23, align=0: 8.67 6.00 ( 30.80%) len=24, pos=23, align=0: 9.34 6.00 ( 35.72%) len=25, pos=24, align=0: 8.00 6.00 ( 24.99%) len=25, pos=24, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 24.99%) len=27, pos=26, align=0: 8.94 6.00 ( 32.88%) len=28, pos=27, align=0: 8.00 6.00 ( 24.99%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) 
len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=32, pos=31, align=0: 11.34 10.01 ( 11.75%) len=32, pos=31, align=0: 11.34 10.00 ( 11.77%) len=256, pos=64, align=1: 35.77 29.33 ( 18.01%) len=256, pos=64, align=1: 35.59 29.50 ( 17.12%) len=256, pos=64, align=15: 35.83 29.30 ( 18.20%) len=256, pos=64, align=15: 35.58 29.17 ( 18.02%) len=256, pos=64, align=2: 35.72 29.50 ( 17.42%) len=256, pos=64, align=2: 35.59 29.33 ( 17.58%) len=256, pos=64, align=30: 35.72 29.16 ( 18.37%) len=256, pos=64, align=30: 35.76 26.18 ( 26.79%) len=256, pos=64, align=3: 35.41 29.44 ( 16.86%) len=256, pos=64, align=3: 36.34 29.30 ( 19.38%) len=256, pos=64, align=45: 35.98 28.28 ( 21.38%) len=256, pos=64, align=45: 35.87 29.65 ( 17.34%) len=256, pos=64, align=4: 35.74 29.37 ( 17.83%) len=256, pos=64, align=4: 35.72 31.72 ( 11.21%) len=256, pos=64, align=60: 36.87 26.12 ( 29.16%) len=256, pos=64, align=60: 35.67 26.67 ( 25.24%) len=256, pos=64, align=5: 35.89 29.31 ( 18.35%) len=256, pos=64, align=5: 35.60 29.50 ( 17.14%) len=256, pos=64, align=75: 37.68 30.98 ( 17.78%) len=256, pos=64, align=75: 35.70 28.69 ( 19.62%) len=256, pos=64, align=6: 36.02 29.06 ( 19.33%) len=256, pos=64, align=6: 35.73 29.21 ( 18.23%) len=256, pos=64, align=90: 36.28 28.19 ( 22.30%) len=256, pos=64, align=90: 36.05 26.19 ( 27.35%) len=256, pos=64, align=7: 35.75 28.19 ( 21.13%) len=256, pos=64, align=7: 35.80 29.34 ( 18.04%) len=256, pos=64, align=105: 36.05 28.68 ( 20.44%) len=256, pos=64, align=105: 35.85 29.14 ( 18.72%) len=1, pos=0, align=0: 4.54 6.40 (-40.85%) len=1, pos=0, align=0: 4.67 4.67 ( 0.03%) len=2, pos=1, align=0: 5.33 4.67 ( 12.46%) len=2, pos=1, align=0: 4.67 4.83 ( -3.51%) len=3, pos=2, align=0: 4.67 4.67 ( -0.04%) len=3, pos=2, align=0: 4.67 4.67 ( -0.01%) len=4, pos=3, 
align=0: 4.67 5.33 (-14.26%) len=4, pos=3, align=0: 4.67 5.33 (-14.30%) len=5, pos=4, align=0: 4.67 5.33 (-14.25%) len=5, pos=4, align=0: 4.67 4.67 ( 0.03%) len=6, pos=5, align=0: 4.94 5.65 (-14.22%) len=6, pos=5, align=0: 4.94 4.94 ( 0.00%) len=7, pos=6, align=0: 4.67 4.67 ( 0.03%) len=7, pos=6, align=0: 4.67 4.67 ( 0.00%) len=8, pos=7, align=0: 7.34 5.33 ( 27.31%) len=8, pos=7, align=0: 8.00 5.33 ( 33.33%) len=9, pos=8, align=0: 8.00 4.67 ( 41.66%) len=9, pos=8, align=0: 7.33 4.67 ( 36.34%) len=10, pos=9, align=0: 7.33 5.33 ( 27.27%) len=10, pos=9, align=0: 8.16 5.33 ( 34.67%) len=11, pos=10, align=0: 8.00 5.33 ( 33.34%) len=11, pos=10, align=0: 8.00 5.33 ( 33.33%) len=12, pos=11, align=0: 8.00 4.67 ( 41.64%) len=12, pos=11, align=0: 8.00 4.67 ( 41.64%) len=13, pos=12, align=0: 8.16 5.33 ( 34.67%) len=13, pos=12, align=0: 7.33 4.67 ( 36.36%) len=14, pos=13, align=0: 8.00 4.67 ( 41.64%) len=14, pos=13, align=0: 8.16 5.33 ( 34.65%) len=15, pos=14, align=0: 8.00 5.33 ( 33.33%) len=15, pos=14, align=0: 8.00 4.67 ( 41.64%) len=16, pos=15, align=0: 8.45 6.89 ( 18.45%) len=16, pos=15, align=0: 8.67 6.67 ( 23.06%) len=17, pos=16, align=0: 7.34 6.00 ( 18.17%) len=17, pos=16, align=0: 6.67 6.00 ( 9.99%) len=18, pos=17, align=0: 6.00 6.00 ( 0.02%) len=18, pos=17, align=0: 6.00 6.00 ( 0.03%) len=19, pos=18, align=0: 6.56 6.00 ( 8.58%) len=19, pos=18, align=0: 6.67 6.00 ( 10.03%) len=20, pos=19, align=0: 6.67 6.14 ( 7.90%) len=20, pos=19, align=0: 6.67 6.00 ( 10.01%) len=21, pos=20, align=0: 6.67 6.00 ( 10.03%) len=21, pos=20, align=0: 6.67 6.00 ( 10.01%) len=22, pos=21, align=0: 6.67 6.00 ( 10.01%) len=22, pos=21, align=0: 6.76 6.00 ( 11.25%) len=23, pos=22, align=0: 6.67 6.00 ( 10.00%) len=23, pos=22, align=0: 6.67 6.00 ( 10.03%) len=24, pos=23, align=0: 8.67 6.00 ( 30.80%) len=24, pos=23, align=0: 9.34 6.00 ( 35.72%) len=25, pos=24, align=0: 8.00 6.00 ( 25.02%) len=25, pos=24, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 24.98%) len=26, pos=25, 
align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.00%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=32, pos=31, align=0: 12.67 10.01 ( 21.04%) len=32, pos=31, align=0: 12.00 10.00 ( 16.67%) len=256, pos=64, align=1: 35.59 30.41 ( 14.58%) len=256, pos=64, align=1: 35.40 29.31 ( 17.18%) len=256, pos=64, align=15: 35.64 27.78 ( 22.06%) len=256, pos=64, align=15: 35.32 30.57 ( 13.44%) len=256, pos=64, align=2: 35.83 30.82 ( 13.98%) len=256, pos=64, align=2: 35.37 29.08 ( 17.79%) len=256, pos=64, align=30: 36.50 26.16 ( 28.32%) len=256, pos=64, align=30: 36.09 26.28 ( 27.18%) len=256, pos=64, align=3: 35.82 28.42 ( 20.67%) len=256, pos=64, align=3: 34.95 27.73 ( 20.66%) len=256, pos=64, align=45: 35.65 27.80 ( 22.01%) len=256, pos=64, align=45: 35.44 28.46 ( 19.70%) len=256, pos=64, align=4: 35.34 30.41 ( 13.97%) len=256, pos=64, align=4: 35.60 27.78 ( 21.97%) len=256, pos=64, align=60: 35.43 27.70 ( 21.83%) len=256, pos=64, align=60: 36.41 27.75 ( 23.79%) len=256, pos=64, align=5: 35.44 30.23 ( 14.71%) len=256, pos=64, align=5: 36.77 30.43 ( 17.25%) len=256, pos=64, align=75: 35.31 27.94 ( 20.88%) len=256, pos=64, align=75: 35.76 28.37 ( 20.67%) len=256, pos=64, align=6: 35.38 28.28 ( 20.08%) len=256, pos=64, align=6: 35.69 27.78 ( 22.17%) len=256, pos=64, align=90: 36.20 27.77 ( 23.29%) len=256, pos=64, align=90: 36.49 29.13 ( 20.18%) len=256, pos=64, align=7: 35.43 27.78 ( 21.60%) len=256, pos=64, align=7: 37.20 29.20 ( 21.52%) len=256, pos=64, align=105: 35.57 27.73 ( 22.06%) len=256, pos=64, align=105: 36.33 28.43 ( 21.73%) len=1, pos=0, align=0: 4.54 6.39 (-40.72%) len=1, pos=0, align=0: 
4.67 4.67 ( 0.04%) len=2, pos=1, align=0: 5.33 5.33 ( -0.01%) len=2, pos=1, align=0: 4.67 5.33 (-14.29%) len=3, pos=2, align=0: 4.67 4.67 ( 0.03%) len=3, pos=2, align=0: 4.67 4.67 ( 0.04%) len=4, pos=3, align=0: 4.67 4.67 ( 0.03%) len=4, pos=3, align=0: 4.67 4.67 ( 0.00%) len=5, pos=4, align=0: 4.67 5.33 (-14.23%) len=5, pos=4, align=0: 4.67 4.67 ( 0.04%) len=6, pos=5, align=0: 4.67 4.67 ( 0.00%) len=6, pos=5, align=0: 4.67 5.33 (-14.24%) len=7, pos=6, align=0: 4.90 5.33 ( -8.94%) len=7, pos=6, align=0: 4.67 5.33 (-14.28%) len=8, pos=7, align=0: 7.51 5.33 ( 29.00%) len=8, pos=7, align=0: 8.00 5.33 ( 33.33%) len=9, pos=8, align=0: 8.00 5.33 ( 33.35%) len=9, pos=8, align=0: 8.00 4.67 ( 41.66%) len=10, pos=9, align=0: 8.00 5.33 ( 33.33%) len=10, pos=9, align=0: 8.00 5.33 ( 33.33%) len=11, pos=10, align=0: 8.00 4.67 ( 41.66%) len=11, pos=10, align=0: 8.16 4.67 ( 42.83%) len=12, pos=11, align=0: 8.00 4.67 ( 41.64%) len=12, pos=11, align=0: 8.00 5.33 ( 33.33%) len=13, pos=12, align=0: 7.33 5.33 ( 27.27%) len=13, pos=12, align=0: 8.00 4.67 ( 41.64%) len=14, pos=13, align=0: 8.00 4.67 ( 41.64%) len=14, pos=13, align=0: 8.00 4.67 ( 41.66%) len=15, pos=14, align=0: 7.33 4.67 ( 36.36%) len=15, pos=14, align=0: 8.00 4.67 ( 41.64%) len=16, pos=15, align=0: 8.67 6.89 ( 20.50%) len=16, pos=15, align=0: 8.67 6.83 ( 21.15%) len=17, pos=16, align=0: 6.67 6.00 ( 10.01%) len=17, pos=16, align=0: 6.00 6.00 ( -0.01%) len=18, pos=17, align=0: 6.67 6.16 ( 7.60%) len=18, pos=17, align=0: 6.67 6.00 ( 10.03%) len=19, pos=18, align=0: 6.00 6.00 ( 0.02%) len=19, pos=18, align=0: 6.67 6.00 ( 10.03%) len=20, pos=19, align=0: 6.83 6.00 ( 12.16%) len=20, pos=19, align=0: 6.67 6.00 ( 10.00%) len=21, pos=20, align=0: 6.00 6.00 ( 0.05%) len=21, pos=20, align=0: 6.67 6.00 ( 10.03%) len=22, pos=21, align=0: 6.67 6.00 ( 10.03%) len=22, pos=21, align=0: 6.67 6.00 ( 10.00%) len=23, pos=22, align=0: 6.67 6.16 ( 7.62%) len=23, pos=22, align=0: 6.67 6.00 ( 10.00%) len=24, pos=23, align=0: 8.67 6.00 ( 30.80%) 
len=24, pos=23, align=0: 9.33 6.00 ( 35.71%) len=25, pos=24, align=0: 8.00 6.00 ( 25.00%) len=25, pos=24, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.67 6.00 ( 30.76%) len=27, pos=26, align=0: 8.67 6.00 ( 30.74%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 24.99%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 24.99%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=32, pos=31, align=0: 12.67 10.01 ( 21.05%) len=32, pos=31, align=0: 12.76 10.00 ( 21.63%) len=256, pos=64, align=1: 35.37 29.05 ( 17.87%) len=256, pos=64, align=1: 35.73 28.45 ( 20.38%) len=256, pos=64, align=15: 35.40 29.74 ( 16.00%) len=256, pos=64, align=15: 36.82 29.23 ( 20.62%) len=256, pos=64, align=2: 35.51 27.76 ( 21.82%) len=256, pos=64, align=2: 35.31 27.99 ( 20.72%) len=256, pos=64, align=30: 35.03 27.77 ( 20.72%) len=256, pos=64, align=30: 35.72 27.15 ( 24.00%) len=256, pos=64, align=3: 35.46 30.45 ( 14.12%) len=256, pos=64, align=3: 35.43 27.88 ( 21.31%) len=256, pos=64, align=45: 35.36 27.73 ( 21.58%) len=256, pos=64, align=45: 35.55 27.73 ( 21.99%) len=256, pos=64, align=4: 35.45 27.98 ( 21.07%) len=256, pos=64, align=4: 38.56 30.94 ( 19.77%) len=256, pos=64, align=60: 35.43 29.30 ( 17.29%) len=256, pos=64, align=60: 35.34 28.17 ( 20.29%) len=256, pos=64, align=5: 36.10 30.37 ( 15.88%) len=256, pos=64, align=5: 35.38 28.48 ( 19.51%) len=256, pos=64, align=75: 36.42 27.73 ( 23.87%) len=256, pos=64, align=75: 35.39 27.93 ( 21.10%) len=256, pos=64, align=6: 36.58 30.37 ( 16.97%) len=256, pos=64, align=6: 35.37 28.90 ( 18.31%) len=256, pos=64, align=90: 35.55 26.13 ( 26.50%) len=256, pos=64, align=90: 36.39 29.42 ( 19.14%) len=256, pos=64, align=7: 35.32 29.51 ( 16.44%) len=256, pos=64, 
align=7: 35.73 27.73 ( 22.41%) len=256, pos=64, align=105: 35.36 30.36 ( 14.14%) len=256, pos=64, align=105: 35.41 27.82 ( 21.42%) len=1, pos=0, align=0: 4.80 6.77 (-41.02%) len=1, pos=0, align=0: 5.65 5.65 ( -0.00%) len=2, pos=1, align=0: 4.94 4.94 ( -0.01%) len=2, pos=1, align=0: 5.65 5.65 ( 0.00%) len=3, pos=2, align=0: 4.94 4.94 ( 0.00%) len=3, pos=2, align=0: 4.94 4.94 ( 0.00%) len=4, pos=3, align=0: 4.94 5.15 ( -4.17%) len=4, pos=3, align=0: 4.94 4.94 ( 0.03%) len=5, pos=4, align=0: 5.16 4.94 ( 4.12%) len=5, pos=4, align=0: 4.94 4.94 ( 0.04%) len=6, pos=5, align=0: 4.94 5.21 ( -5.35%) len=6, pos=5, align=0: 4.94 4.94 ( -0.01%) len=7, pos=6, align=0: 4.94 5.17 ( -4.55%) len=7, pos=6, align=0: 4.94 4.94 ( 0.04%) len=8, pos=7, align=0: 7.97 5.65 ( 29.17%) len=8, pos=7, align=0: 8.47 5.65 ( 33.33%) len=9, pos=8, align=0: 8.47 5.65 ( 33.34%) len=9, pos=8, align=0: 8.47 4.94 ( 41.64%) len=10, pos=9, align=0: 8.47 4.94 ( 41.64%) len=10, pos=9, align=0: 7.97 5.65 ( 29.17%) len=11, pos=10, align=0: 8.47 5.65 ( 33.33%) len=11, pos=10, align=0: 8.47 4.94 ( 41.64%) len=12, pos=11, align=0: 8.47 5.65 ( 33.33%) len=12, pos=11, align=0: 8.47 5.65 ( 33.32%) len=13, pos=12, align=0: 8.47 5.65 ( 33.32%) len=13, pos=12, align=0: 8.47 5.65 ( 33.32%) len=14, pos=13, align=0: 8.47 5.65 ( 33.33%) len=14, pos=13, align=0: 7.77 5.65 ( 27.27%) len=15, pos=14, align=0: 7.77 5.14 ( 33.82%) len=15, pos=14, align=0: 7.77 5.65 ( 27.27%) len=16, pos=15, align=0: 9.21 7.30 ( 20.77%) len=16, pos=15, align=0: 8.70 6.67 ( 23.37%) len=17, pos=16, align=0: 6.95 6.00 ( 13.59%) len=17, pos=16, align=0: 6.67 6.00 ( 10.03%) len=18, pos=17, align=0: 6.00 6.00 ( 0.03%) len=18, pos=17, align=0: 6.67 6.00 ( 10.02%) len=19, pos=18, align=0: 6.67 6.00 ( 10.03%) len=19, pos=18, align=0: 6.67 6.18 ( 7.38%) len=20, pos=19, align=0: 6.67 6.00 ( 10.00%) len=20, pos=19, align=0: 6.67 6.00 ( 10.00%) len=21, pos=20, align=0: 6.00 6.00 ( 0.02%) len=21, pos=20, align=0: 6.67 6.00 ( 10.03%) len=22, pos=21, align=0: 
6.67 6.17 ( 7.55%) len=22, pos=21, align=0: 6.67 6.00 ( 10.00%) len=23, pos=22, align=0: 6.67 6.00 ( 10.03%) len=23, pos=22, align=0: 6.67 6.00 ( 10.03%) len=24, pos=23, align=0: 8.67 6.00 ( 30.79%) len=24, pos=23, align=0: 8.67 6.00 ( 30.79%) len=25, pos=24, align=0: 8.00 6.00 ( 25.02%) len=25, pos=24, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 24.98%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=32, pos=31, align=0: 12.67 10.00 ( 21.04%) len=32, pos=31, align=0: 12.00 10.00 ( 16.65%) len=256, pos=64, align=1: 35.59 30.38 ( 14.63%) len=256, pos=64, align=1: 35.64 30.35 ( 14.86%) len=256, pos=64, align=15: 35.12 30.56 ( 12.98%) len=256, pos=64, align=15: 36.68 27.83 ( 24.13%) len=256, pos=64, align=2: 35.39 30.60 ( 13.53%) len=256, pos=64, align=2: 36.85 27.83 ( 24.48%) len=256, pos=64, align=30: 35.54 29.10 ( 18.12%) len=256, pos=64, align=30: 36.33 27.73 ( 23.67%) len=256, pos=64, align=3: 35.59 29.07 ( 18.32%) len=256, pos=64, align=3: 35.58 29.68 ( 16.58%) len=256, pos=64, align=45: 35.62 30.36 ( 14.74%) len=256, pos=64, align=45: 35.40 27.93 ( 21.09%) len=256, pos=64, align=4: 35.48 27.78 ( 21.70%) len=256, pos=64, align=4: 35.42 27.94 ( 21.11%) len=256, pos=64, align=60: 35.43 29.13 ( 17.77%) len=256, pos=64, align=60: 36.30 26.18 ( 27.86%) len=256, pos=64, align=5: 35.39 27.73 ( 21.66%) len=256, pos=64, align=5: 35.48 27.80 ( 21.64%) len=256, pos=64, align=75: 34.94 27.92 ( 20.08%) len=256, pos=64, align=75: 36.40 27.82 ( 23.57%) len=256, pos=64, align=6: 35.58 30.01 ( 15.67%) len=256, 
pos=64, align=6: 35.54 30.36 ( 14.55%) len=256, pos=64, align=90: 36.26 27.06 ( 25.38%) len=256, pos=64, align=90: 35.80 29.05 ( 18.86%) len=256, pos=64, align=7: 36.35 29.73 ( 18.22%) len=256, pos=64, align=7: 36.33 27.74 ( 23.65%) len=256, pos=64, align=105: 35.49 30.41 ( 14.31%) len=256, pos=64, align=105: 35.59 29.50 ( 17.12%) len=1, pos=0, align=0: 4.54 6.47 (-42.30%) len=1, pos=0, align=0: 4.67 4.67 ( 0.00%) len=2, pos=1, align=0: 4.67 4.67 ( 0.04%) len=2, pos=1, align=0: 5.33 4.67 ( 12.47%) len=3, pos=2, align=0: 4.94 4.94 ( 0.04%) len=3, pos=2, align=0: 4.94 4.94 ( 0.00%) len=4, pos=3, align=0: 4.84 5.33 (-10.27%) len=4, pos=3, align=0: 4.67 5.33 (-14.24%) len=5, pos=4, align=0: 4.83 4.67 ( 3.42%) len=5, pos=4, align=0: 4.67 4.67 ( -0.00%) len=6, pos=5, align=0: 4.67 5.30 (-13.40%) len=6, pos=5, align=0: 4.67 5.33 (-14.24%) len=7, pos=6, align=0: 4.67 4.67 ( 0.02%) len=7, pos=6, align=0: 4.67 5.33 (-14.24%) len=8, pos=7, align=0: 7.34 5.33 ( 27.30%) len=8, pos=7, align=0: 8.00 5.33 ( 33.33%) len=9, pos=8, align=0: 8.16 5.33 ( 34.66%) len=9, pos=8, align=0: 8.00 5.33 ( 33.33%) len=10, pos=9, align=0: 8.00 5.33 ( 33.33%) len=10, pos=9, align=0: 8.00 5.33 ( 33.33%) len=11, pos=10, align=0: 8.00 4.67 ( 41.66%) len=11, pos=10, align=0: 7.33 5.33 ( 27.27%) len=12, pos=11, align=0: 8.00 4.67 ( 41.66%) len=12, pos=11, align=0: 8.00 5.33 ( 33.32%) len=13, pos=12, align=0: 8.00 5.33 ( 33.32%) len=13, pos=12, align=0: 8.00 5.33 ( 33.32%) len=14, pos=13, align=0: 8.10 4.67 ( 42.36%) len=14, pos=13, align=0: 7.33 4.67 ( 36.36%) len=15, pos=14, align=0: 7.33 4.67 ( 36.34%) len=15, pos=14, align=0: 8.18 5.33 ( 34.77%) len=16, pos=15, align=0: 8.70 6.67 ( 23.34%) len=16, pos=15, align=0: 8.70 6.89 ( 20.80%) len=17, pos=16, align=0: 6.69 6.00 ( 10.30%) len=17, pos=16, align=0: 6.69 6.00 ( 10.36%) len=18, pos=17, align=0: 6.70 6.00 ( 10.37%) len=18, pos=17, align=0: 6.70 6.00 ( 10.38%) len=19, pos=18, align=0: 6.70 6.00 ( 10.39%) len=19, pos=18, align=0: 6.70 6.22 ( 7.07%) 
len=20, pos=19, align=0: 6.70 6.00 ( 10.37%) len=20, pos=19, align=0: 6.70 6.00 ( 10.41%) len=21, pos=20, align=0: 6.63 6.00 ( 9.56%) len=21, pos=20, align=0: 6.64 6.00 ( 9.57%) len=22, pos=21, align=0: 6.70 6.16 ( 7.95%) len=22, pos=21, align=0: 6.64 6.00 ( 9.58%) len=23, pos=22, align=0: 6.65 6.00 ( 9.72%) len=23, pos=22, align=0: 6.70 6.00 ( 10.39%) len=24, pos=23, align=0: 8.67 6.00 ( 30.80%) len=24, pos=23, align=0: 8.67 6.00 ( 30.79%) len=25, pos=24, align=0: 8.67 6.00 ( 30.77%) len=25, pos=24, align=0: 8.00 6.00 ( 24.99%) len=26, pos=25, align=0: 8.00 6.00 ( 25.01%) len=26, pos=25, align=0: 8.00 6.00 ( 24.98%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=27, pos=26, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=28, pos=27, align=0: 8.00 6.00 ( 25.01%) len=29, pos=28, align=0: 8.00 6.00 ( 24.99%) len=29, pos=28, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=30, pos=29, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=31, pos=30, align=0: 8.00 6.00 ( 25.01%) len=32, pos=31, align=0: 12.67 10.00 ( 21.05%) len=32, pos=31, align=0: 12.00 10.00 ( 16.66%) Function: strrchr Variant: __strrchr_evex __strrchr_evex512 ======================================================================================================================== len=2048, pos=32, align=0: 9.50 8.16 ( 14.13%) len=2048, pos=32, align=1: 9.00 7.26 ( 19.37%) len=2048, pos=64, align=0: 10.52 10.20 ( 3.00%) len=2048, pos=64, align=2: 10.58 10.02 ( 5.34%) len=2048, pos=128, align=0: 15.96 11.74 ( 26.44%) len=2048, pos=128, align=3: 15.70 11.43 ( 27.23%) len=2048, pos=256, align=0: 17.59 16.30 ( 7.34%) len=2048, pos=256, align=4: 16.54 14.24 ( 13.87%) len=2048, pos=512, align=0: 23.84 16.44 ( 31.03%) len=2048, pos=512, align=5: 23.30 15.14 ( 35.02%) len=2048, pos=1024, align=0: 34.75 24.35 ( 29.93%) len=2048, pos=1024, align=6: 34.59 24.42 ( 29.41%) len=2048, pos=2048, align=0: 62.52 40.61 ( 35.05%) 
len=2048, pos=2048, align=7: 63.02 40.42 ( 35.87%) len=2048, pos=4096, align=0: 62.52 40.51 ( 35.21%) len=2048, pos=4096, align=8: 62.59 40.44 ( 35.40%) len=256, pos=64, align=1: 7.45 7.67 ( -2.93%) len=256, pos=64, align=1: 7.51 7.73 ( -2.95%) len=256, pos=64, align=15: 6.67 6.00 ( 10.00%) len=256, pos=64, align=15: 7.50 7.90 ( -5.28%) len=256, pos=64, align=2: 6.67 6.00 ( 9.99%) len=256, pos=64, align=2: 6.67 6.00 ( 9.99%) len=256, pos=64, align=30: 7.50 7.88 ( -5.12%) len=256, pos=64, align=30: 6.67 6.00 ( 10.00%) len=256, pos=64, align=3: 6.67 6.00 ( 10.00%) len=256, pos=64, align=3: 7.60 7.84 ( -3.12%) len=256, pos=64, align=45: 6.67 6.00 ( 10.00%) len=256, pos=64, align=45: 6.67 6.00 ( 10.00%) len=256, pos=64, align=4: 7.56 7.66 ( -1.32%) len=256, pos=64, align=4: 6.67 6.00 ( 9.97%) len=256, pos=64, align=60: 7.89 7.67 ( 2.81%) len=256, pos=64, align=60: 7.58 7.73 ( -1.95%) len=256, pos=64, align=5: 6.67 6.00 ( 10.00%) len=256, pos=64, align=5: 7.38 7.65 ( -3.76%) len=256, pos=64, align=75: 7.39 7.71 ( -4.33%) len=256, pos=64, align=75: 6.67 6.00 ( 10.00%) len=256, pos=64, align=6: 7.47 7.90 ( -5.76%) len=256, pos=64, align=6: 6.67 6.00 ( 10.00%) len=256, pos=64, align=90: 6.67 6.00 ( 10.00%) len=256, pos=64, align=90: 7.47 7.72 ( -3.43%) len=256, pos=64, align=7: 6.67 6.00 ( 10.00%) len=256, pos=64, align=7: 6.67 6.00 ( 10.00%) len=256, pos=64, align=105: 7.56 7.70 ( -1.75%) len=256, pos=64, align=105: 6.67 6.00 ( 10.00%) len=1, pos=0, align=0: 4.01 4.67 (-16.61%) len=1, pos=0, align=0: 4.00 4.67 (-16.69%) len=1, pos=0, align=4095: 5.34 5.34 ( 0.03%) len=2, pos=1, align=0: 4.00 4.67 (-16.69%) len=2, pos=1, align=0: 4.00 4.67 (-16.67%) len=2, pos=1, align=4095: 6.67 6.00 ( 10.00%) len=3, pos=2, align=0: 4.00 4.67 (-16.62%) len=3, pos=2, align=0: 4.00 4.67 (-16.70%) len=3, pos=2, align=4094: 6.84 6.00 ( 12.19%) len=4, pos=3, align=0: 4.00 4.68 (-16.94%) len=4, pos=3, align=0: 4.00 4.84 (-20.93%) len=4, pos=3, align=4094: 6.67 6.00 ( 10.00%) len=5, pos=4, 
align=0: 4.00 4.84 (-20.87%) len=5, pos=4, align=0: 4.00 4.67 (-16.63%) len=5, pos=4, align=4093: 6.67 6.00 ( 9.98%) len=6, pos=5, align=0: 4.00 4.67 (-16.72%) len=6, pos=5, align=0: 4.00 4.67 (-16.66%) len=6, pos=5, align=4093: 6.67 6.00 ( 9.96%) len=7, pos=6, align=0: 4.00 4.67 (-16.64%) len=7, pos=6, align=0: 4.00 4.67 (-16.66%) len=7, pos=6, align=4092: 6.67 6.00 ( 9.99%) len=8, pos=7, align=0: 4.00 4.67 (-16.70%) len=8, pos=7, align=0: 4.00 4.67 (-16.63%) len=8, pos=7, align=4092: 6.67 6.00 ( 10.00%) len=9, pos=8, align=0: 4.00 4.67 (-16.64%) len=9, pos=8, align=0: 4.00 4.67 (-16.65%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.00 4.67 (-16.71%) len=10, pos=9, align=0: 4.00 4.67 (-16.70%) len=10, pos=9, align=4091: 6.84 6.00 ( 12.19%) len=11, pos=10, align=0: 4.00 4.67 (-16.64%) len=11, pos=10, align=0: 4.00 4.67 (-16.70%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.00 4.67 (-16.69%) len=12, pos=11, align=0: 4.00 4.67 (-16.63%) len=12, pos=11, align=4090: 6.67 6.00 ( 9.97%) len=13, pos=12, align=0: 4.00 4.67 (-16.64%) len=13, pos=12, align=0: 4.00 4.67 (-16.65%) len=13, pos=12, align=4089: 6.67 6.00 ( 9.99%) len=14, pos=13, align=0: 4.00 4.67 (-16.67%) len=14, pos=13, align=0: 4.00 4.67 (-16.66%) len=14, pos=13, align=4089: 6.67 6.00 ( 10.00%) len=15, pos=14, align=0: 4.00 4.67 (-16.64%) len=15, pos=14, align=0: 4.00 4.67 (-16.63%) len=15, pos=14, align=4088: 6.67 6.16 ( 7.63%) len=16, pos=15, align=0: 4.00 4.67 (-16.67%) len=16, pos=15, align=0: 4.00 4.67 (-16.66%) len=16, pos=15, align=4088: 6.67 6.00 ( 9.99%) len=17, pos=16, align=0: 4.00 4.68 (-16.99%) len=17, pos=16, align=0: 4.00 4.68 (-16.91%) len=17, pos=16, align=4087: 7.06 6.35 ( 9.99%) len=18, pos=17, align=0: 4.24 4.96 (-17.01%) len=18, pos=17, align=0: 4.00 4.68 (-16.94%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.97%) len=19, pos=18, align=0: 4.00 4.68 (-16.93%) len=19, pos=18, align=0: 4.00 4.68 (-16.97%) len=19, pos=18, align=4086: 6.84 
6.00 ( 12.21%) len=20, pos=19, align=0: 4.00 4.68 (-17.00%) len=20, pos=19, align=0: 4.00 4.68 (-17.00%) len=20, pos=19, align=4086: 6.67 6.00 ( 9.99%) len=21, pos=20, align=0: 4.00 4.68 (-16.91%) len=21, pos=20, align=0: 4.00 4.68 (-17.02%) len=21, pos=20, align=4085: 6.67 6.00 ( 10.00%) len=22, pos=21, align=0: 4.00 4.68 (-16.95%) len=22, pos=21, align=0: 4.00 4.84 (-20.93%) len=22, pos=21, align=4085: 6.67 6.00 ( 9.99%) len=23, pos=22, align=0: 4.00 4.68 (-16.95%) len=23, pos=22, align=0: 4.00 4.68 (-16.97%) len=23, pos=22, align=4084: 6.67 6.00 ( 9.97%) len=24, pos=23, align=0: 4.00 4.68 (-16.97%) len=24, pos=23, align=0: 4.00 4.68 (-16.91%) len=24, pos=23, align=4084: 6.67 6.00 ( 9.97%) len=25, pos=24, align=0: 4.00 4.68 (-17.02%) len=25, pos=24, align=0: 4.00 4.68 (-16.91%) len=25, pos=24, align=4083: 6.67 6.00 ( 10.00%) len=26, pos=25, align=0: 4.00 4.68 (-16.98%) len=26, pos=25, align=0: 4.00 4.68 (-16.92%) len=26, pos=25, align=4083: 6.67 6.00 ( 9.99%) len=27, pos=26, align=0: 4.00 4.68 (-17.00%) len=27, pos=26, align=0: 4.00 4.68 (-16.98%) len=27, pos=26, align=4082: 6.67 6.00 ( 9.99%) len=28, pos=27, align=0: 4.00 4.68 (-16.95%) len=28, pos=27, align=0: 4.17 4.68 (-12.18%) len=28, pos=27, align=4082: 6.67 6.00 ( 10.00%) len=29, pos=28, align=0: 4.00 4.68 (-16.91%) len=29, pos=28, align=0: 4.00 4.68 (-16.99%) len=29, pos=28, align=4081: 6.67 6.00 ( 10.00%) len=30, pos=29, align=0: 4.00 4.68 (-16.91%) len=30, pos=29, align=0: 4.00 4.68 (-16.98%) len=30, pos=29, align=4081: 6.84 6.00 ( 12.28%) len=31, pos=30, align=0: 4.00 4.68 (-16.94%) len=31, pos=30, align=0: 4.00 4.68 (-16.92%) len=31, pos=30, align=4080: 6.67 6.00 ( 9.97%) len=32, pos=31, align=0: 4.17 4.68 (-12.23%) len=32, pos=31, align=0: 4.00 4.68 (-16.97%) len=32, pos=31, align=4080: 6.67 6.00 ( 9.99%) len=2048, pos=32, align=0: 87.08 43.87 ( 49.62%) len=2048, pos=32, align=1: 85.90 43.75 ( 49.06%) len=2048, pos=64, align=0: 92.56 46.43 ( 49.84%) len=2048, pos=64, align=2: 85.68 43.81 ( 48.87%) 
len=2048, pos=128, align=0: 61.45 42.93 ( 30.13%) len=2048, pos=128, align=3: 61.15 43.53 ( 28.82%) len=2048, pos=256, align=0: 64.65 44.51 ( 31.16%) len=2048, pos=256, align=4: 64.00 45.53 ( 28.87%) len=2048, pos=512, align=0: 64.18 45.71 ( 28.78%) len=2048, pos=512, align=5: 61.30 42.45 ( 30.76%) len=2048, pos=1024, align=0: 67.65 47.13 ( 30.33%) len=2048, pos=1024, align=6: 62.01 45.89 ( 26.00%) len=2048, pos=2048, align=0: 60.77 43.90 ( 27.77%) len=2048, pos=2048, align=7: 61.80 43.89 ( 28.98%) len=2048, pos=4096, align=0: 62.35 45.05 ( 27.75%) len=2048, pos=4096, align=8: 61.32 43.86 ( 28.47%) len=256, pos=64, align=1: 13.92 14.40 ( -3.49%) len=256, pos=64, align=1: 14.34 15.56 ( -8.53%) len=256, pos=64, align=15: 13.88 14.48 ( -4.30%) len=256, pos=64, align=15: 14.58 14.48 ( 0.68%) len=256, pos=64, align=2: 14.02 14.62 ( -4.27%) len=256, pos=64, align=2: 13.95 14.53 ( -4.11%) len=256, pos=64, align=30: 13.99 14.53 ( -3.82%) len=256, pos=64, align=30: 13.92 14.35 ( -3.09%) len=256, pos=64, align=3: 14.18 13.96 ( 1.52%) len=256, pos=64, align=3: 13.93 14.56 ( -4.48%) len=256, pos=64, align=45: 13.90 14.62 ( -5.19%) len=256, pos=64, align=45: 14.00 14.70 ( -5.02%) len=256, pos=64, align=4: 13.88 14.50 ( -4.50%) len=256, pos=64, align=4: 14.00 15.59 (-11.30%) len=256, pos=64, align=60: 13.94 14.53 ( -4.21%) len=256, pos=64, align=60: 14.06 14.46 ( -2.83%) len=256, pos=64, align=5: 13.97 14.02 ( -0.38%) len=256, pos=64, align=5: 14.01 14.52 ( -3.64%) len=256, pos=64, align=75: 13.98 14.26 ( -2.00%) len=256, pos=64, align=75: 14.54 13.73 ( 5.55%) len=256, pos=64, align=6: 14.13 14.38 ( -1.76%) len=256, pos=64, align=6: 13.43 14.11 ( -5.05%) len=256, pos=64, align=90: 14.03 14.44 ( -2.92%) len=256, pos=64, align=90: 13.94 14.63 ( -4.97%) len=256, pos=64, align=7: 13.88 13.98 ( -0.73%) len=256, pos=64, align=7: 14.09 14.26 ( -1.20%) len=256, pos=64, align=105: 13.88 13.44 ( 3.17%) len=256, pos=64, align=105: 14.02 13.40 ( 4.43%) len=1, pos=0, align=0: 4.00 4.67 
(-16.67%) len=1, pos=0, align=0: 4.00 4.67 (-16.62%) len=1, pos=0, align=4095: 6.67 6.17 ( 7.56%) len=2, pos=1, align=0: 4.00 4.67 (-16.64%) len=2, pos=1, align=0: 4.00 4.67 (-16.67%) len=2, pos=1, align=4095: 6.67 6.00 ( 9.99%) len=3, pos=2, align=0: 4.01 4.68 (-16.65%) len=3, pos=2, align=0: 4.01 4.68 (-16.65%) len=3, pos=2, align=4094: 6.67 6.00 ( 9.97%) len=4, pos=3, align=0: 4.01 4.68 (-16.58%) len=4, pos=3, align=0: 4.01 4.68 (-16.64%) len=4, pos=3, align=4094: 6.85 6.00 ( 12.44%) len=5, pos=4, align=0: 4.01 4.68 (-16.66%) len=5, pos=4, align=0: 4.01 4.84 (-20.72%) len=5, pos=4, align=4093: 6.67 6.00 ( 10.00%) len=6, pos=5, align=0: 4.17 4.68 (-12.20%) len=6, pos=5, align=0: 4.00 4.68 (-16.94%) len=6, pos=5, align=4093: 6.67 6.24 ( 6.35%) len=7, pos=6, align=0: 4.01 4.68 (-16.63%) len=7, pos=6, align=0: 4.01 4.68 (-16.60%) len=7, pos=6, align=4092: 6.67 6.00 ( 9.99%) len=8, pos=7, align=0: 4.01 4.83 (-20.48%) len=8, pos=7, align=0: 4.01 4.67 (-16.30%) len=8, pos=7, align=4092: 6.67 6.00 ( 9.99%) len=9, pos=8, align=0: 4.01 4.68 (-16.66%) len=9, pos=8, align=0: 4.01 4.68 (-16.60%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.01 4.84 (-20.62%) len=10, pos=9, align=0: 4.01 4.68 (-16.61%) len=10, pos=9, align=4091: 6.67 6.18 ( 7.33%) len=11, pos=10, align=0: 4.00 4.68 (-16.91%) len=11, pos=10, align=0: 4.00 4.84 (-20.90%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.17 4.68 (-12.27%) len=12, pos=11, align=0: 4.01 4.68 (-16.57%) len=12, pos=11, align=4090: 6.83 6.00 ( 12.08%) len=13, pos=12, align=0: 4.01 4.68 (-16.64%) len=13, pos=12, align=0: 4.01 4.68 (-16.63%) len=13, pos=12, align=4089: 6.67 6.00 ( 9.97%) len=14, pos=13, align=0: 4.01 4.68 (-16.61%) len=14, pos=13, align=0: 4.01 4.68 (-16.63%) len=14, pos=13, align=4089: 6.67 6.00 ( 9.99%) len=15, pos=14, align=0: 4.01 4.68 (-16.66%) len=15, pos=14, align=0: 4.01 4.68 (-16.63%) len=15, pos=14, align=4088: 6.67 6.17 ( 7.48%) len=16, pos=15, align=0: 4.01 
4.68 (-16.62%) len=16, pos=15, align=0: 4.01 4.68 (-16.63%) len=16, pos=15, align=4088: 6.67 6.00 ( 9.99%) len=17, pos=16, align=0: 4.01 4.68 (-16.56%) len=17, pos=16, align=0: 4.01 4.68 (-16.56%) len=17, pos=16, align=4087: 6.67 6.00 ( 9.99%) len=18, pos=17, align=0: 4.01 4.68 (-16.56%) len=18, pos=17, align=0: 4.01 4.68 (-16.56%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 4.01 4.68 (-16.57%) len=19, pos=18, align=0: 4.01 4.68 (-16.58%) len=19, pos=18, align=4086: 6.83 6.00 ( 12.11%) len=20, pos=19, align=0: 4.01 4.68 (-16.61%) len=20, pos=19, align=0: 4.01 4.68 (-16.66%) len=20, pos=19, align=4086: 6.67 6.14 ( 7.93%) len=21, pos=20, align=0: 4.01 4.68 (-16.59%) len=21, pos=20, align=0: 4.01 4.68 (-16.60%) len=21, pos=20, align=4085: 6.67 6.00 ( 10.00%) len=22, pos=21, align=0: 4.01 4.68 (-16.56%) len=22, pos=21, align=0: 4.01 4.68 (-16.61%) len=22, pos=21, align=4085: 6.67 6.00 ( 10.00%) len=23, pos=22, align=0: 4.01 4.68 (-16.58%) len=23, pos=22, align=0: 4.01 4.68 (-16.53%) len=23, pos=22, align=4084: 6.67 6.00 ( 10.00%) len=24, pos=23, align=0: 4.01 4.68 (-16.58%) len=24, pos=23, align=0: 4.16 4.68 (-12.36%) len=24, pos=23, align=4084: 6.67 6.00 ( 9.98%) len=25, pos=24, align=0: 4.01 4.68 (-16.60%) len=25, pos=24, align=0: 4.02 4.68 (-16.47%) len=25, pos=24, align=4083: 6.67 6.00 ( 9.99%) len=26, pos=25, align=0: 4.01 4.68 (-16.58%) len=26, pos=25, align=0: 4.01 4.68 (-16.57%) len=26, pos=25, align=4083: 6.67 6.00 ( 9.99%) len=27, pos=26, align=0: 4.01 4.68 (-16.57%) len=27, pos=26, align=0: 4.01 4.68 (-16.57%) len=27, pos=26, align=4082: 6.67 6.00 ( 10.00%) len=28, pos=27, align=0: 4.01 4.68 (-16.55%) len=28, pos=27, align=0: 4.01 4.68 (-16.59%) len=28, pos=27, align=4082: 6.83 6.00 ( 12.11%) len=29, pos=28, align=0: 4.01 4.68 (-16.61%) len=29, pos=28, align=0: 4.01 4.68 (-16.56%) len=29, pos=28, align=4081: 6.67 6.00 ( 9.99%) len=30, pos=29, align=0: 4.01 4.83 (-20.49%) len=30, pos=29, align=0: 4.01 4.68 (-16.57%) len=30, pos=29, 
align=4081: 6.67 6.00 ( 10.00%) len=31, pos=30, align=0: 4.01 4.68 (-16.54%) len=31, pos=30, align=0: 4.01 4.68 (-16.58%) len=31, pos=30, align=4080: 6.67 6.00 ( 9.99%) len=32, pos=31, align=0: 6.67 4.68 ( 29.91%) len=32, pos=31, align=0: 6.00 4.67 ( 22.24%) len=32, pos=31, align=4080: 9.16 6.00 ( 34.51%) len=2048, pos=32, align=0: 88.33 44.81 ( 49.28%) len=2048, pos=32, align=1: 86.53 43.80 ( 49.39%) len=2048, pos=64, align=0: 88.52 43.85 ( 50.46%) len=2048, pos=64, align=2: 86.64 43.87 ( 49.36%) len=2048, pos=128, align=0: 61.30 44.25 ( 27.82%) len=2048, pos=128, align=3: 61.02 44.11 ( 27.72%) len=2048, pos=256, align=0: 63.58 44.55 ( 29.93%) len=2048, pos=256, align=4: 63.56 44.44 ( 30.09%) len=2048, pos=512, align=0: 63.08 45.16 ( 28.40%) len=2048, pos=512, align=5: 62.77 44.46 ( 29.17%) len=2048, pos=1024, align=0: 63.36 45.16 ( 28.74%) len=2048, pos=1024, align=6: 65.55 46.72 ( 28.73%) len=2048, pos=2048, align=0: 62.27 42.25 ( 32.16%) len=2048, pos=2048, align=7: 62.75 42.16 ( 32.82%) len=2048, pos=4096, align=0: 66.74 46.42 ( 30.45%) len=2048, pos=4096, align=8: 61.43 43.88 ( 28.57%) len=256, pos=64, align=1: 14.69 13.93 ( 5.13%) len=256, pos=64, align=1: 14.86 15.09 ( -1.50%) len=256, pos=64, align=15: 14.00 14.05 ( -0.35%) len=256, pos=64, align=15: 14.05 14.47 ( -2.97%) len=256, pos=64, align=2: 14.46 14.47 ( -0.13%) len=256, pos=64, align=2: 13.95 14.51 ( -3.99%) len=256, pos=64, align=30: 14.00 14.52 ( -3.71%) len=256, pos=64, align=30: 13.95 14.89 ( -6.72%) len=256, pos=64, align=3: 14.11 14.52 ( -2.85%) len=256, pos=64, align=3: 13.42 13.94 ( -3.92%) len=256, pos=64, align=45: 13.95 14.49 ( -3.84%) len=256, pos=64, align=45: 14.52 14.41 ( 0.71%) len=256, pos=64, align=4: 14.60 14.35 ( 1.66%) len=256, pos=64, align=4: 14.71 14.34 ( 2.52%) len=256, pos=64, align=60: 14.03 14.31 ( -1.99%) len=256, pos=64, align=60: 14.59 14.01 ( 3.98%) len=256, pos=64, align=5: 15.26 14.55 ( 4.62%) len=256, pos=64, align=5: 13.98 14.04 ( -0.45%) len=256, pos=64, 
align=75: 14.65 13.46 ( 8.13%) len=256, pos=64, align=75: 13.89 13.45 ( 3.15%) len=256, pos=64, align=6: 14.00 14.70 ( -5.03%) len=256, pos=64, align=6: 13.96 14.59 ( -4.57%) len=256, pos=64, align=90: 14.00 13.38 ( 4.40%) len=256, pos=64, align=90: 14.61 15.13 ( -3.58%) len=256, pos=64, align=7: 14.02 13.98 ( 0.26%) len=256, pos=64, align=7: 13.90 14.01 ( -0.77%) len=256, pos=64, align=105: 13.89 15.27 ( -9.90%) len=256, pos=64, align=105: 13.98 15.07 ( -7.82%) len=1, pos=0, align=0: 4.00 4.67 (-16.66%) len=1, pos=0, align=0: 4.00 4.83 (-20.66%) len=1, pos=0, align=4095: 6.67 6.00 ( 10.02%) len=2, pos=1, align=0: 4.00 4.67 (-16.65%) len=2, pos=1, align=0: 4.00 4.67 (-16.67%) len=2, pos=1, align=4095: 6.67 6.00 ( 9.99%) len=3, pos=2, align=0: 4.00 4.68 (-16.87%) len=3, pos=2, align=0: 4.00 4.68 (-16.95%) len=3, pos=2, align=4094: 6.67 6.18 ( 7.33%) len=4, pos=3, align=0: 4.00 4.68 (-16.89%) len=4, pos=3, align=0: 4.00 4.84 (-21.02%) len=4, pos=3, align=4094: 6.67 6.00 ( 9.99%) len=5, pos=4, align=0: 4.00 4.68 (-16.88%) len=5, pos=4, align=0: 4.00 4.68 (-16.90%) len=5, pos=4, align=4093: 6.67 6.00 ( 9.99%) len=6, pos=5, align=0: 4.00 4.68 (-16.83%) len=6, pos=5, align=0: 4.00 4.68 (-16.91%) len=6, pos=5, align=4093: 6.67 6.00 ( 10.00%) len=7, pos=6, align=0: 4.00 4.68 (-16.91%) len=7, pos=6, align=0: 4.00 4.68 (-16.87%) len=7, pos=6, align=4092: 6.67 6.00 ( 10.00%) len=8, pos=7, align=0: 4.00 4.68 (-16.87%) len=8, pos=7, align=0: 4.00 4.68 (-16.87%) len=8, pos=7, align=4092: 6.67 6.00 ( 9.99%) len=9, pos=8, align=0: 4.00 4.67 (-16.64%) len=9, pos=8, align=0: 4.00 4.67 (-16.70%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.00 4.68 (-16.98%) len=10, pos=9, align=0: 4.00 4.68 (-16.88%) len=10, pos=9, align=4091: 6.67 6.00 ( 9.99%) len=11, pos=10, align=0: 4.00 4.68 (-16.88%) len=11, pos=10, align=0: 4.00 4.68 (-16.95%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.00 4.68 (-16.94%) len=12, pos=11, align=0: 4.00 4.68 
(-16.91%) len=12, pos=11, align=4090: 6.67 6.00 ( 9.99%) len=13, pos=12, align=0: 4.00 4.84 (-21.00%) len=13, pos=12, align=0: 4.00 4.67 (-16.66%) len=13, pos=12, align=4089: 6.83 6.00 ( 12.19%) len=14, pos=13, align=0: 4.01 4.68 (-16.56%) len=14, pos=13, align=0: 4.01 4.68 (-16.63%) len=14, pos=13, align=4089: 6.67 6.00 ( 10.00%) len=15, pos=14, align=0: 4.01 4.68 (-16.54%) len=15, pos=14, align=0: 4.01 4.68 (-16.57%) len=15, pos=14, align=4088: 6.67 6.00 ( 9.99%) len=16, pos=15, align=0: 4.01 4.83 (-20.49%) len=16, pos=15, align=0: 4.01 4.67 (-16.34%) len=16, pos=15, align=4088: 6.82 6.00 ( 12.06%) len=17, pos=16, align=0: 4.01 4.68 (-16.53%) len=17, pos=16, align=0: 4.01 4.68 (-16.57%) len=17, pos=16, align=4087: 7.06 6.36 ( 9.97%) len=18, pos=17, align=0: 4.95 4.68 ( 5.47%) len=18, pos=17, align=0: 4.00 4.68 (-16.90%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 4.00 4.68 (-16.88%) len=19, pos=18, align=0: 4.00 4.84 (-20.92%) len=19, pos=18, align=4086: 6.67 6.00 ( 9.97%) len=20, pos=19, align=0: 4.00 4.68 (-16.85%) len=20, pos=19, align=0: 4.00 4.68 (-16.93%) len=20, pos=19, align=4086: 6.67 6.00 ( 9.99%) len=21, pos=20, align=0: 4.00 4.68 (-16.88%) len=21, pos=20, align=0: 4.00 4.68 (-16.95%) len=21, pos=20, align=4085: 6.67 6.00 ( 9.99%) len=22, pos=21, align=0: 4.00 4.87 (-21.67%) len=22, pos=21, align=0: 4.00 4.68 (-16.96%) len=22, pos=21, align=4085: 6.67 6.00 ( 9.99%) len=23, pos=22, align=0: 4.00 4.68 (-16.88%) len=23, pos=22, align=0: 4.00 4.68 (-16.88%) len=23, pos=22, align=4084: 6.67 6.00 ( 9.99%) len=24, pos=23, align=0: 4.00 4.68 (-16.85%) len=24, pos=23, align=0: 4.00 4.68 (-16.88%) len=24, pos=23, align=4084: 6.67 6.00 ( 9.99%) len=25, pos=24, align=0: 4.00 4.68 (-16.88%) len=25, pos=24, align=0: 4.00 4.68 (-16.88%) len=25, pos=24, align=4083: 6.67 6.00 ( 9.99%) len=26, pos=25, align=0: 4.00 4.83 (-20.79%) len=26, pos=25, align=0: 4.00 4.67 (-16.67%) len=26, pos=25, align=4083: 6.67 6.00 ( 9.96%) len=27, pos=26, 
align=0: 4.16 4.68 (-12.35%) len=27, pos=26, align=0: 4.00 4.68 (-16.91%) len=27, pos=26, align=4082: 6.67 6.00 ( 9.99%) len=28, pos=27, align=0: 4.00 4.68 (-16.90%) len=28, pos=27, align=0: 4.00 4.68 (-16.91%) len=28, pos=27, align=4082: 6.67 6.00 ( 10.00%) len=29, pos=28, align=0: 4.00 4.68 (-16.88%) len=29, pos=28, align=0: 4.00 4.84 (-20.87%) len=29, pos=28, align=4081: 6.67 6.00 ( 9.99%) len=30, pos=29, align=0: 4.00 4.68 (-16.87%) len=30, pos=29, align=0: 4.00 4.68 (-16.91%) len=30, pos=29, align=4081: 6.67 6.00 ( 9.99%) len=31, pos=30, align=0: 4.00 4.68 (-16.91%) len=31, pos=30, align=0: 4.00 4.68 (-16.91%) len=31, pos=30, align=4080: 6.67 6.00 ( 9.97%) len=32, pos=31, align=0: 6.01 4.84 ( 19.50%) len=32, pos=31, align=0: 6.00 4.68 ( 22.05%) len=32, pos=31, align=4080: 9.00 6.00 ( 33.34%) len=2048, pos=32, align=0: 85.36 44.00 ( 48.46%) len=2048, pos=32, align=1: 86.66 43.76 ( 49.50%) len=2048, pos=64, align=0: 90.74 43.79 ( 51.74%) len=2048, pos=64, align=2: 86.03 43.95 ( 48.91%) len=2048, pos=128, align=0: 62.48 43.37 ( 30.59%) len=2048, pos=128, align=3: 61.36 44.03 ( 28.24%) len=2048, pos=256, align=0: 61.49 44.26 ( 28.02%) len=2048, pos=256, align=4: 61.74 44.60 ( 27.77%) len=2048, pos=512, align=0: 65.32 46.82 ( 28.33%) len=2048, pos=512, align=5: 66.22 47.36 ( 28.49%) len=2048, pos=1024, align=0: 65.56 48.78 ( 25.60%) len=2048, pos=1024, align=6: 66.16 49.63 ( 24.99%) len=2048, pos=2048, align=0: 64.26 44.68 ( 30.46%) len=2048, pos=2048, align=7: 64.79 44.85 ( 30.78%) len=2048, pos=4096, align=0: 62.72 42.25 ( 32.63%) len=2048, pos=4096, align=8: 61.97 43.09 ( 30.47%) len=256, pos=64, align=1: 13.90 14.57 ( -4.84%) len=256, pos=64, align=1: 13.90 14.44 ( -3.93%) len=256, pos=64, align=15: 14.44 14.49 ( -0.39%) len=256, pos=64, align=15: 13.96 14.47 ( -3.66%) len=256, pos=64, align=2: 14.04 14.29 ( -1.75%) len=256, pos=64, align=2: 14.57 14.41 ( 1.11%) len=256, pos=64, align=30: 13.92 14.50 ( -4.19%) len=256, pos=64, align=30: 14.14 14.50 ( -2.57%) 
len=256, pos=64, align=3: 13.96 14.31 ( -2.56%) len=256, pos=64, align=3: 13.89 15.16 ( -9.15%) len=256, pos=64, align=45: 14.00 14.34 ( -2.41%) len=256, pos=64, align=45: 13.94 14.53 ( -4.22%) len=256, pos=64, align=4: 14.80 14.54 ( 1.78%) len=256, pos=64, align=4: 13.96 14.29 ( -2.37%) len=256, pos=64, align=60: 13.92 14.54 ( -4.46%) len=256, pos=64, align=60: 13.89 14.20 ( -2.21%) len=256, pos=64, align=5: 13.98 14.04 ( -0.38%) len=256, pos=64, align=5: 14.46 14.00 ( 3.23%) len=256, pos=64, align=75: 13.97 13.45 ( 3.74%) len=256, pos=64, align=75: 14.94 13.49 ( 9.76%) len=256, pos=64, align=6: 14.01 14.42 ( -2.87%) len=256, pos=64, align=6: 14.01 13.99 ( 0.15%) len=256, pos=64, align=90: 14.18 14.42 ( -1.68%) len=256, pos=64, align=90: 13.89 13.43 ( 3.28%) len=256, pos=64, align=7: 13.99 13.97 ( 0.14%) len=256, pos=64, align=7: 14.01 15.04 ( -7.29%) len=256, pos=64, align=105: 14.03 13.46 ( 4.04%) len=256, pos=64, align=105: 14.49 13.45 ( 7.17%) len=1, pos=0, align=0: 4.00 4.68 (-17.04%) len=1, pos=0, align=0: 4.00 4.69 (-17.08%) len=1, pos=0, align=4095: 6.67 6.00 ( 9.96%) len=2, pos=1, align=0: 4.00 4.68 (-16.91%) len=2, pos=1, align=0: 4.00 4.68 (-16.90%) len=2, pos=1, align=4095: 6.67 6.00 ( 9.99%) len=3, pos=2, align=0: 4.00 4.68 (-16.89%) len=3, pos=2, align=0: 4.16 4.68 (-12.41%) len=3, pos=2, align=4094: 6.67 6.00 ( 10.01%) len=4, pos=3, align=0: 4.16 4.68 (-12.47%) len=4, pos=3, align=0: 4.00 4.68 (-16.93%) len=4, pos=3, align=4094: 6.67 6.16 ( 7.61%) len=5, pos=4, align=0: 4.00 4.68 (-16.91%) len=5, pos=4, align=0: 4.00 4.68 (-16.92%) len=5, pos=4, align=4093: 6.67 6.00 ( 10.00%) len=6, pos=5, align=0: 4.00 4.68 (-16.95%) len=6, pos=5, align=0: 4.00 4.68 (-16.92%) len=6, pos=5, align=4093: 6.67 6.00 ( 9.99%) len=7, pos=6, align=0: 4.00 4.68 (-16.90%) len=7, pos=6, align=0: 4.00 4.68 (-16.92%) len=7, pos=6, align=4092: 6.67 6.00 ( 10.00%) len=8, pos=7, align=0: 4.00 4.68 (-16.88%) len=8, pos=7, align=0: 4.00 4.68 (-16.90%) len=8, pos=7, align=4092: 6.67 
6.00 ( 9.99%) len=9, pos=8, align=0: 4.00 4.68 (-16.92%) len=9, pos=8, align=0: 4.00 4.68 (-16.90%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.00 4.68 (-16.94%) len=10, pos=9, align=0: 4.00 4.68 (-16.88%) len=10, pos=9, align=4091: 6.67 6.00 ( 9.99%) len=11, pos=10, align=0: 4.00 4.68 (-16.90%) len=11, pos=10, align=0: 4.00 4.68 (-16.91%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.00 4.68 (-16.93%) len=12, pos=11, align=0: 4.00 4.68 (-16.91%) len=12, pos=11, align=4090: 6.67 6.16 ( 7.60%) len=13, pos=12, align=0: 4.00 4.68 (-16.89%) len=13, pos=12, align=0: 4.16 4.68 (-12.44%) len=13, pos=12, align=4089: 6.67 6.00 ( 10.00%) len=14, pos=13, align=0: 4.00 4.68 (-16.91%) len=14, pos=13, align=0: 4.00 4.68 (-16.89%) len=14, pos=13, align=4089: 6.67 6.00 ( 9.99%) len=15, pos=14, align=0: 4.00 4.68 (-16.91%) len=15, pos=14, align=0: 4.00 4.68 (-16.88%) len=15, pos=14, align=4088: 6.67 6.00 ( 9.97%) len=16, pos=15, align=0: 4.00 4.68 (-16.91%) len=16, pos=15, align=0: 4.16 4.68 (-12.45%) len=16, pos=15, align=4088: 6.67 6.00 ( 9.99%) len=17, pos=16, align=0: 4.00 4.68 (-16.92%) len=17, pos=16, align=0: 4.00 4.68 (-16.91%) len=17, pos=16, align=4087: 6.67 6.00 ( 9.99%) len=18, pos=17, align=0: 4.00 4.84 (-20.88%) len=18, pos=17, align=0: 4.00 4.68 (-16.87%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 4.00 4.68 (-16.91%) len=19, pos=18, align=0: 4.00 4.68 (-16.90%) len=19, pos=18, align=4086: 6.67 6.16 ( 7.58%) len=20, pos=19, align=0: 4.00 4.68 (-16.88%) len=20, pos=19, align=0: 4.00 4.68 (-16.90%) len=20, pos=19, align=4086: 6.67 6.00 ( 9.99%) len=21, pos=20, align=0: 4.00 4.68 (-16.89%) len=21, pos=20, align=0: 4.00 4.68 (-16.90%) len=21, pos=20, align=4085: 6.67 6.00 ( 9.99%) len=22, pos=21, align=0: 4.00 4.68 (-16.97%) len=22, pos=21, align=0: 4.16 4.68 (-12.44%) len=22, pos=21, align=4085: 6.67 6.00 ( 9.99%) len=23, pos=22, align=0: 4.00 4.68 (-16.90%) len=23, pos=22, align=0: 4.00 
4.83 (-20.79%) len=23, pos=22, align=4084: 6.67 6.00 ( 10.00%) len=24, pos=23, align=0: 4.00 4.68 (-16.91%) len=24, pos=23, align=0: 4.00 4.83 (-20.81%) len=24, pos=23, align=4084: 6.67 6.00 ( 10.00%) len=25, pos=24, align=0: 4.00 4.68 (-16.93%) len=25, pos=24, align=0: 4.00 4.83 (-20.84%) len=25, pos=24, align=4083: 6.67 6.00 ( 9.99%) len=26, pos=25, align=0: 4.00 4.68 (-16.91%) len=26, pos=25, align=0: 4.16 4.68 (-12.47%) len=26, pos=25, align=4083: 6.67 6.00 ( 10.00%) len=27, pos=26, align=0: 4.00 4.68 (-16.90%) len=27, pos=26, align=0: 4.00 4.68 (-16.90%) len=27, pos=26, align=4082: 6.67 6.00 ( 9.99%) len=28, pos=27, align=0: 4.00 4.68 (-16.91%) len=28, pos=27, align=0: 4.00 4.68 (-16.90%) len=28, pos=27, align=4082: 6.67 6.17 ( 7.53%) len=29, pos=28, align=0: 4.00 4.68 (-16.89%) len=29, pos=28, align=0: 4.00 4.68 (-16.91%) len=29, pos=28, align=4081: 6.67 6.00 ( 9.99%) len=30, pos=29, align=0: 4.00 4.67 (-16.67%) len=30, pos=29, align=0: 4.00 4.67 (-16.66%) len=30, pos=29, align=4081: 6.67 6.00 ( 9.99%) len=31, pos=30, align=0: 4.00 4.68 (-16.90%) len=31, pos=30, align=0: 4.00 4.68 (-16.91%) len=31, pos=30, align=4080: 6.67 6.00 ( 9.99%) len=32, pos=31, align=0: 6.00 4.68 ( 22.07%) len=32, pos=31, align=0: 6.00 4.68 ( 22.09%) len=32, pos=31, align=4080: 9.00 6.16 ( 31.53%) len=2048, pos=32, align=0: 64.76 43.76 ( 32.43%) len=2048, pos=32, align=1: 59.89 43.78 ( 26.90%) len=2048, pos=64, align=0: 63.03 46.42 ( 26.36%) len=2048, pos=64, align=2: 60.22 43.99 ( 26.95%) len=2048, pos=128, align=0: 60.54 44.18 ( 27.03%) len=2048, pos=128, align=3: 61.40 43.48 ( 29.19%) len=2048, pos=256, align=0: 62.04 44.21 ( 28.73%) len=2048, pos=256, align=4: 61.04 46.58 ( 23.68%) len=2048, pos=512, align=0: 67.69 45.74 ( 32.43%) len=2048, pos=512, align=5: 68.13 47.23 ( 30.68%) len=2048, pos=1024, align=0: 72.87 52.68 ( 27.71%) len=2048, pos=1024, align=6: 73.18 54.81 ( 25.10%) len=2048, pos=2048, align=0: 67.89 55.91 ( 17.64%) len=2048, pos=2048, align=7: 68.21 55.83 ( 18.15%) 
len=2048, pos=4096, align=0: 63.24 45.10 ( 28.68%) len=2048, pos=4096, align=8: 61.78 45.91 ( 25.68%) len=256, pos=64, align=1: 13.99 14.56 ( -4.06%) len=256, pos=64, align=1: 13.94 14.57 ( -4.49%) len=256, pos=64, align=15: 14.00 14.51 ( -3.65%) len=256, pos=64, align=15: 15.77 14.49 ( 8.14%) len=256, pos=64, align=2: 14.03 15.27 ( -8.86%) len=256, pos=64, align=2: 13.89 14.49 ( -4.30%) len=256, pos=64, align=30: 14.14 14.04 ( 0.71%) len=256, pos=64, align=30: 13.46 13.98 ( -3.91%) len=256, pos=64, align=3: 13.90 14.50 ( -4.33%) len=256, pos=64, align=3: 14.16 14.48 ( -2.27%) len=256, pos=64, align=45: 13.88 14.61 ( -5.24%) len=256, pos=64, align=45: 14.00 14.56 ( -4.01%) len=256, pos=64, align=4: 13.94 14.70 ( -5.46%) len=256, pos=64, align=4: 13.91 14.44 ( -3.82%) len=256, pos=64, align=60: 14.01 14.90 ( -6.39%) len=256, pos=64, align=60: 14.01 14.55 ( -3.90%) len=256, pos=64, align=5: 15.16 14.59 ( 3.79%) len=256, pos=64, align=5: 13.93 14.53 ( -4.28%) len=256, pos=64, align=75: 14.02 13.43 ( 4.21%) len=256, pos=64, align=75: 14.00 13.39 ( 4.36%) len=256, pos=64, align=6: 13.97 14.51 ( -3.85%) len=256, pos=64, align=6: 14.17 14.54 ( -2.61%) len=256, pos=64, align=90: 13.89 13.50 ( 2.80%) len=256, pos=64, align=90: 14.17 14.41 ( -1.74%) len=256, pos=64, align=7: 13.90 14.04 ( -1.05%) len=256, pos=64, align=7: 14.00 14.04 ( -0.27%) len=256, pos=64, align=105: 13.98 13.57 ( 2.89%) len=256, pos=64, align=105: 13.88 15.42 (-11.09%) len=1, pos=0, align=0: 4.00 4.67 (-16.68%) len=1, pos=0, align=0: 4.00 4.67 (-16.65%) len=1, pos=0, align=4095: 6.67 6.00 ( 10.03%) len=2, pos=1, align=0: 4.00 4.67 (-16.64%) len=2, pos=1, align=0: 4.00 4.67 (-16.65%) len=2, pos=1, align=4095: 6.67 6.00 ( 9.97%) len=3, pos=2, align=0: 4.00 4.88 (-21.90%) len=3, pos=2, align=0: 4.00 4.67 (-16.63%) len=3, pos=2, align=4094: 6.67 6.00 ( 9.97%) len=4, pos=3, align=0: 4.00 4.67 (-16.66%) len=4, pos=3, align=0: 4.00 4.67 (-16.67%) len=4, pos=3, align=4094: 6.67 6.00 ( 10.00%) len=5, pos=4, 
align=0: 4.00 4.67 (-16.66%) len=5, pos=4, align=0: 4.00 4.67 (-16.63%) len=5, pos=4, align=4093: 6.67 6.00 ( 10.00%) len=6, pos=5, align=0: 4.00 4.67 (-16.65%) len=6, pos=5, align=0: 4.00 4.67 (-16.66%) len=6, pos=5, align=4093: 6.67 6.00 ( 9.99%) len=7, pos=6, align=0: 4.00 4.67 (-16.65%) len=7, pos=6, align=0: 4.00 4.67 (-16.63%) len=7, pos=6, align=4092: 6.67 6.00 ( 10.00%) len=8, pos=7, align=0: 4.00 4.67 (-16.65%) len=8, pos=7, align=0: 4.00 4.67 (-16.66%) len=8, pos=7, align=4092: 6.85 6.00 ( 12.35%) len=9, pos=8, align=0: 4.00 4.67 (-16.66%) len=9, pos=8, align=0: 4.00 4.67 (-16.62%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.00 4.67 (-16.60%) len=10, pos=9, align=0: 4.00 4.67 (-16.65%) len=10, pos=9, align=4091: 6.67 6.00 ( 9.99%) len=11, pos=10, align=0: 4.00 4.67 (-16.60%) len=11, pos=10, align=0: 4.00 4.67 (-16.65%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.00 4.67 (-16.63%) len=12, pos=11, align=0: 4.00 4.67 (-16.66%) len=12, pos=11, align=4090: 6.67 6.00 ( 9.99%) len=13, pos=12, align=0: 4.00 4.67 (-16.63%) len=13, pos=12, align=0: 4.00 4.67 (-16.70%) len=13, pos=12, align=4089: 6.67 6.00 ( 9.99%) len=14, pos=13, align=0: 4.00 4.67 (-16.71%) len=14, pos=13, align=0: 4.00 4.83 (-20.69%) len=14, pos=13, align=4089: 6.67 6.00 ( 9.99%) len=15, pos=14, align=0: 4.00 4.67 (-16.66%) len=15, pos=14, align=0: 4.00 4.67 (-16.65%) len=15, pos=14, align=4088: 6.67 6.00 ( 9.97%) len=16, pos=15, align=0: 4.00 4.67 (-16.70%) len=16, pos=15, align=0: 4.00 4.67 (-16.65%) len=16, pos=15, align=4088: 6.67 6.00 ( 9.99%) len=17, pos=16, align=0: 4.00 4.67 (-16.66%) len=17, pos=16, align=0: 4.16 4.67 (-12.07%) len=17, pos=16, align=4087: 6.67 6.00 ( 9.99%) len=18, pos=17, align=0: 4.00 4.67 (-16.65%) len=18, pos=17, align=0: 4.00 4.67 (-16.66%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 4.00 4.67 (-16.65%) len=19, pos=18, align=0: 4.00 4.67 (-16.65%) len=19, pos=18, align=4086: 6.67 
6.00 ( 9.99%) len=20, pos=19, align=0: 4.00 4.67 (-16.70%) len=20, pos=19, align=0: 4.00 4.67 (-16.66%) len=20, pos=19, align=4086: 6.67 6.00 ( 9.96%) len=21, pos=20, align=0: 4.00 4.82 (-20.53%) len=21, pos=20, align=0: 4.00 4.67 (-16.65%) len=21, pos=20, align=4085: 6.67 6.00 ( 9.99%) len=22, pos=21, align=0: 4.00 4.67 (-16.63%) len=22, pos=21, align=0: 4.00 4.67 (-16.65%) len=22, pos=21, align=4085: 6.67 6.00 ( 9.99%) len=23, pos=22, align=0: 4.00 4.67 (-16.65%) len=23, pos=22, align=0: 4.00 4.67 (-16.70%) len=23, pos=22, align=4084: 6.67 6.00 ( 9.96%) len=24, pos=23, align=0: 4.00 4.67 (-16.65%) len=24, pos=23, align=0: 4.00 4.67 (-16.70%) len=24, pos=23, align=4084: 6.67 6.00 ( 9.99%) len=25, pos=24, align=0: 4.00 4.67 (-16.63%) len=25, pos=24, align=0: 4.00 4.67 (-16.66%) len=25, pos=24, align=4083: 6.67 6.00 ( 9.97%) len=26, pos=25, align=0: 4.00 4.67 (-16.65%) len=26, pos=25, align=0: 4.00 4.67 (-16.64%) len=26, pos=25, align=4083: 6.67 6.00 ( 9.99%) len=27, pos=26, align=0: 4.00 4.83 (-20.66%) len=27, pos=26, align=0: 4.00 4.67 (-16.66%) len=27, pos=26, align=4082: 6.67 6.00 ( 9.99%) len=28, pos=27, align=0: 4.00 4.67 (-16.66%) len=28, pos=27, align=0: 4.00 4.67 (-16.65%) len=28, pos=27, align=4082: 6.67 6.00 ( 9.99%) len=29, pos=28, align=0: 4.00 4.67 (-16.62%) len=29, pos=28, align=0: 4.00 4.67 (-16.65%) len=29, pos=28, align=4081: 6.67 6.00 ( 10.00%) len=30, pos=29, align=0: 4.00 4.67 (-16.65%) len=30, pos=29, align=0: 4.00 4.67 (-16.65%) len=30, pos=29, align=4081: 6.67 6.00 ( 9.99%) len=31, pos=30, align=0: 4.00 4.67 (-16.63%) len=31, pos=30, align=0: 4.00 4.67 (-16.66%) len=31, pos=30, align=4080: 6.67 6.00 ( 10.02%) len=32, pos=31, align=0: 6.01 4.67 ( 22.28%) len=32, pos=31, align=0: 6.00 4.67 ( 22.20%) len=32, pos=31, align=4080: 9.00 6.00 ( 33.34%) len=2048, pos=32, align=0: 66.79 43.91 ( 34.25%) len=2048, pos=32, align=1: 86.30 43.76 ( 49.29%) len=2048, pos=64, align=0: 93.06 48.59 ( 47.78%) len=2048, pos=64, align=2: 91.49 43.89 ( 52.03%) 
len=2048, pos=128, align=0: 60.63 42.77 ( 29.46%) len=2048, pos=128, align=3: 61.23 42.61 ( 30.41%) len=2048, pos=256, align=0: 62.65 44.54 ( 28.92%) len=2048, pos=256, align=4: 62.56 44.47 ( 28.91%) len=2048, pos=512, align=0: 69.30 47.13 ( 31.99%) len=2048, pos=512, align=5: 68.40 48.17 ( 29.57%) len=2048, pos=1024, align=0: 80.41 52.68 ( 34.48%) len=2048, pos=1024, align=6: 80.58 52.92 ( 34.32%) len=2048, pos=2048, align=0: 78.58 67.28 ( 14.37%) len=2048, pos=2048, align=7: 78.98 67.49 ( 14.54%) len=2048, pos=4096, align=0: 67.66 55.91 ( 17.37%) len=2048, pos=4096, align=8: 67.71 55.39 ( 18.19%) len=256, pos=64, align=1: 15.74 14.18 ( 9.91%) len=256, pos=64, align=1: 16.02 14.31 ( 10.72%) len=256, pos=64, align=15: 14.22 14.06 ( 1.11%) len=256, pos=64, align=15: 13.89 13.96 ( -0.47%) len=256, pos=64, align=2: 14.01 14.03 ( -0.14%) len=256, pos=64, align=2: 13.95 14.21 ( -1.89%) len=256, pos=64, align=30: 13.89 13.98 ( -0.69%) len=256, pos=64, align=30: 14.08 14.05 ( 0.21%) len=256, pos=64, align=3: 14.08 13.97 ( 0.80%) len=256, pos=64, align=3: 13.89 14.28 ( -2.80%) len=256, pos=64, align=45: 14.20 14.14 ( 0.43%) len=256, pos=64, align=45: 13.90 14.29 ( -2.76%) len=256, pos=64, align=4: 13.96 15.58 (-11.58%) len=256, pos=64, align=4: 14.58 16.07 (-10.16%) len=256, pos=64, align=60: 14.54 14.31 ( 1.58%) len=256, pos=64, align=60: 14.14 14.03 ( 0.77%) len=256, pos=64, align=5: 13.93 14.04 ( -0.79%) len=256, pos=64, align=5: 13.43 13.99 ( -4.16%) len=256, pos=64, align=75: 13.92 13.94 ( -0.10%) len=256, pos=64, align=75: 13.99 13.77 ( 1.54%) len=256, pos=64, align=6: 14.60 14.04 ( 3.89%) len=256, pos=64, align=6: 14.18 13.97 ( 1.46%) len=256, pos=64, align=90: 14.03 13.45 ( 4.19%) len=256, pos=64, align=90: 14.58 13.91 ( 4.56%) len=256, pos=64, align=7: 14.02 14.28 ( -1.81%) len=256, pos=64, align=7: 14.11 14.28 ( -1.17%) len=256, pos=64, align=105: 13.88 13.56 ( 2.31%) len=256, pos=64, align=105: 14.57 13.46 ( 7.61%) len=1, pos=0, align=0: 4.00 4.67 (-16.59%) 
len=1, pos=0, align=0: 4.00 4.67 (-16.66%) len=1, pos=0, align=4095: 6.83 6.00 ( 12.09%) len=2, pos=1, align=0: 4.01 4.67 (-16.35%) len=2, pos=1, align=0: 4.01 4.67 (-16.33%) len=2, pos=1, align=4095: 6.67 6.00 ( 9.99%) len=3, pos=2, align=0: 4.00 4.68 (-16.95%) len=3, pos=2, align=0: 4.00 4.68 (-16.89%) len=3, pos=2, align=4094: 6.67 6.00 ( 10.00%) len=4, pos=3, align=0: 4.01 4.68 (-16.60%) len=4, pos=3, align=0: 4.01 4.68 (-16.59%) len=4, pos=3, align=4094: 6.67 6.00 ( 9.99%) len=5, pos=4, align=0: 4.01 4.68 (-16.59%) len=5, pos=4, align=0: 4.01 4.68 (-16.62%) len=5, pos=4, align=4093: 6.67 6.00 ( 9.99%) len=6, pos=5, align=0: 4.01 4.68 (-16.59%) len=6, pos=5, align=0: 4.01 4.68 (-16.62%) len=6, pos=5, align=4093: 6.83 6.00 ( 12.08%) len=7, pos=6, align=0: 4.01 4.68 (-16.57%) len=7, pos=6, align=0: 4.01 4.84 (-20.65%) len=7, pos=6, align=4092: 6.67 6.00 ( 9.97%) len=8, pos=7, align=0: 4.24 4.68 (-10.43%) len=8, pos=7, align=0: 4.00 4.68 (-16.95%) len=8, pos=7, align=4092: 6.67 6.17 ( 7.50%) len=9, pos=8, align=0: 4.01 4.68 (-16.62%) len=9, pos=8, align=0: 4.01 4.84 (-20.54%) len=9, pos=8, align=4091: 6.67 6.00 ( 9.99%) len=10, pos=9, align=0: 4.01 4.84 (-20.63%) len=10, pos=9, align=0: 4.01 4.68 (-16.66%) len=10, pos=9, align=4091: 6.67 6.16 ( 7.57%) len=11, pos=10, align=0: 4.01 4.68 (-16.64%) len=11, pos=10, align=0: 4.00 4.85 (-21.08%) len=11, pos=10, align=4090: 6.67 6.00 ( 9.99%) len=12, pos=11, align=0: 4.01 4.68 (-16.60%) len=12, pos=11, align=0: 4.01 4.68 (-16.64%) len=12, pos=11, align=4090: 6.83 6.00 ( 12.12%) len=13, pos=12, align=0: 4.01 4.68 (-16.65%) len=13, pos=12, align=0: 4.01 4.68 (-16.63%) len=13, pos=12, align=4089: 6.67 6.00 ( 10.00%) len=14, pos=13, align=0: 4.01 4.68 (-16.61%) len=14, pos=13, align=0: 4.01 4.68 (-16.60%) len=14, pos=13, align=4089: 6.67 6.00 ( 9.99%) len=15, pos=14, align=0: 4.01 4.68 (-16.62%) len=15, pos=14, align=0: 4.01 4.68 (-16.59%) len=15, pos=14, align=4088: 6.67 6.18 ( 7.37%) len=16, pos=15, align=0: 4.01 4.67 
(-16.40%) len=16, pos=15, align=0: 4.01 4.67 (-16.31%) len=16, pos=15, align=4088: 6.67 6.00 ( 9.99%) len=17, pos=16, align=0: 4.01 4.68 (-16.62%) len=17, pos=16, align=0: 4.01 4.68 (-16.59%) len=17, pos=16, align=4087: 6.67 6.00 ( 10.00%) len=18, pos=17, align=0: 4.01 4.68 (-16.63%) len=18, pos=17, align=0: 4.01 4.68 (-16.63%) len=18, pos=17, align=4087: 6.67 6.00 ( 9.99%) len=19, pos=18, align=0: 4.01 4.68 (-16.60%) len=19, pos=18, align=0: 4.01 4.68 (-16.65%) len=19, pos=18, align=4086: 6.67 6.00 ( 10.00%) len=20, pos=19, align=0: 5.35 4.96 ( 7.42%) len=20, pos=19, align=0: 4.24 4.96 (-17.01%) len=20, pos=19, align=4086: 7.06 6.35 ( 10.00%) len=21, pos=20, align=0: 4.25 4.96 (-16.65%) len=21, pos=20, align=0: 4.25 4.95 (-16.64%) len=21, pos=20, align=4085: 7.06 6.35 ( 9.99%) len=22, pos=21, align=0: 4.25 4.95 (-16.66%) len=22, pos=21, align=0: 4.25 4.96 (-16.70%) len=22, pos=21, align=4085: 7.06 6.35 ( 9.99%) len=23, pos=22, align=0: 4.25 4.95 (-16.61%) len=23, pos=22, align=0: 4.25 4.95 (-16.59%) len=23, pos=22, align=4084: 7.06 6.35 ( 10.00%) len=24, pos=23, align=0: 4.25 4.95 (-16.64%) len=24, pos=23, align=0: 4.25 4.96 (-16.63%) len=24, pos=23, align=4084: 7.06 6.35 ( 10.00%) len=25, pos=24, align=0: 4.25 4.95 (-16.60%) len=25, pos=24, align=0: 4.42 4.95 (-12.17%) len=25, pos=24, align=4083: 7.06 6.35 ( 9.99%) len=26, pos=25, align=0: 4.25 4.95 (-16.64%) len=26, pos=25, align=0: 4.25 4.95 (-16.64%) len=26, pos=25, align=4083: 7.06 6.57 ( 6.91%) len=27, pos=26, align=0: 4.25 4.94 (-16.39%) len=27, pos=26, align=0: 4.25 4.94 (-16.35%) len=27, pos=26, align=4082: 7.06 6.35 ( 9.99%) len=28, pos=27, align=0: 4.25 5.16 (-21.52%) len=28, pos=27, align=0: 4.25 4.96 (-16.68%) len=28, pos=27, align=4082: 7.06 6.35 ( 10.00%) len=29, pos=28, align=0: 4.25 4.95 (-16.61%) len=29, pos=28, align=0: 4.25 5.15 (-21.28%) len=29, pos=28, align=4081: 7.06 6.35 ( 9.99%) len=30, pos=29, align=0: 4.01 4.68 (-16.66%) len=30, pos=29, align=0: 4.01 4.68 (-16.62%) len=30, pos=29, 
align=4081: 6.87 6.00 ( 12.59%) len=31, pos=30, align=0: 4.01 4.68 (-16.66%) len=31, pos=30, align=0: 4.25 4.95 (-16.63%) len=31, pos=30, align=4080: 6.67 6.00 ( 9.99%) len=32, pos=31, align=0: 6.01 4.68 ( 22.09%) len=32, pos=31, align=0: 6.00 4.88 ( 18.74%) len=32, pos=31, align=4080: 8.67 6.00 ( 30.79%)
On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha > > <libc-alpha@sourceware.org> wrote: > > > > > > This patch implements following evex512 version of string functions. > > > evex512 version takes up to 30% less cycle as compared to evex, > > > depending on length and alignment. > > > > > > > Please attach benchmark numbers. > > > > > - strrchr function using 512 bit vectors. > > > - wcsrchr function using 512 bit vectors. > > > > > > Code size data: > > > > > > strrchr-evex.o 833 byte > > > strrchr-evex512.o 573 byte (-31%) > > > > > > wcsrchr-evex.o 836 byte > > > wcsrchr-evex512.o 581 byte (-31%) > > > > > > Placeholder function, not used by any processor at the moment. > > > --- > > > sysdeps/x86_64/multiarch/Makefile | 2 + > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > > > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > > > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > > > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > > > 5 files changed, 334 insertions(+) > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > > > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > index df4601c294..6a275f1c3d 100644 > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > @@ -110,6 +110,7 @@ sysdep_routines += \ > > > strrchr-avx2 \ > > > strrchr-avx2-rtm \ > > > strrchr-evex \ > > > + strrchr-evex512 \ > > > strrchr-sse2 \ > > > strspn-sse4 \ > > > strstr-avx512 \ > > > @@ -152,6 +153,7 @@ sysdep_routines += \ > > > wcsrchr-avx2 \ > > > 
wcsrchr-avx2-rtm \ > > > wcsrchr-evex \ > > > + wcsrchr-evex512 \ > > > wcsrchr-sse2 \ > > > wmemchr-avx2 \ > > > wmemchr-avx2-rtm \ > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > index a71444eccb..26c941023a 100644 > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > (CPU_FEATURE_USABLE (AVX512VL) > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > __strrchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __strrchr_evex512) > > > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > > > CPU_FEATURE_USABLE (AVX2), > > > __strrchr_avx2) > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > && CPU_FEATURE_USABLE (AVX512BW) > > > && CPU_FEATURE_USABLE (BMI2)), > > > __wcsrchr_evex) > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > + && CPU_FEATURE_USABLE (BMI2)), > > > + __wcsrchr_evex512) > > > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > > > CPU_FEATURE_USABLE (AVX2), > > > __wcsrchr_avx2) > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > new file mode 100644 > > > index 0000000000..e937cb193c > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > @@ -0,0 +1,307 @@ > > > +/* Placeholder function, not used by any processor at the moment. > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. 
> > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +/* UNUSED. Exists purely as reference implementation. */ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > +# include <sysdep.h> > > > + > > > +# ifdef USE_AS_WCSRCHR > > > +# define CHAR_SIZE 4 > > > +# define VPBROADCAST vpbroadcastd > > > +# define VPCMP vpcmpd > > > +# define VPMINU vpminud > > > +# define VPTESTN vptestnmd > > > +# else > > > +# define CHAR_SIZE 1 > > > +# define VPBROADCAST vpbroadcastb > > > +# define VPCMP vpcmpb > > > +# define VPMINU vpminub > > > +# define VPTESTN vptestnmb > > > +# endif > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > + > > > +# if VEC_SIZE == 64 > > > +# define BLSMSK blsmskq > > > +# define BSR bsrq > > > +# define KMOV kmovq > > > +# define KOR korq > > > +# define KORTEST kortestq > > > +# define R8 r8 > > > +# define RAX rax > > > +# define RCX rcx > > > +# define RDX rdx > > > +# define SHR shrq > > > +# define TEXTSUFFIX evex512 > > > +# define VMM0 zmm16 > > > +# define VMM1 zmm17 > > > +# define VMM2 zmm18 > > > +# define VMM3 zmm19 > > > +# define VMM4 zmm20 > > > +# define VMM5 zmm21 > > > +# define VMOVA vmovdqa64 > > > +# define VMOVU vmovdqu64 > > > + > > 
> +# elif VEC_SIZE == 32 > > > +/* Currently Unused. */ > > > +# define BLSMSK blsmskl > > > +# define BSR bsrl > > > +# define KMOV kmovd > > > +# define KOR kord > > > +# define KORTEST kortestd > > > +# define R8 r8d > > > +# define RAX eax > > > +# define RCX ecx > > > +# define RDX edx > > > +# define SHR shrl > > > +# define TEXTSUFFIX evex256 > > > +# define VMM0 ymm16 > > > +# define VMM1 ymm17 > > > +# define VMM2 ymm18 > > > +# define VMM3 ymm19 > > > +# define VMM4 ymm20 > > > +# define VMM5 ymm21 > > > +# define VMOVA vmovdqa32 > > > +# define VMOVU vmovdqu32 > > > +# endif > > > + > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > +/* Aligning entry point to 64 byte, provides better performance for > > > + one vector length string. */ > > > +ENTRY_P2ALIGN (STRRCHR, 6) > > > + > > > + /* Broadcast CHAR to VMM0. */ > > > + VPBROADCAST %esi, %VMM0 > > > + movl %edi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(page_cross) > > > + > > > +L(page_cross_continue): > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > + VMOVU (%rdi), %VMM1 > > > + > > > + VPTESTN %VMM1, %VMM1, %k1 > > > + KMOV %k1, %RCX > > > + test %RCX, %RCX > > > + jz L(align_more) > > > + > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > + KMOV %k0, %RAX > > > + BLSMSK %RCX, %RCX > > > + and %RCX, %RAX > > > + jz L(ret) > > > + > > > + BSR %RAX, %RAX > > > +# ifdef USE_AS_WCSRCHR > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rdi, %rax > > > +# endif > > > +L(ret): > > > + ret > > > + > > > +L(vector_x2_end): > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > + KMOV %k2, %RAX > > > + BLSMSK %RCX, %RCX > > > + and %RCX, %RAX > > > + jz L(vector_x1_ret) > > > + > > > + BSR %RAX, %RAX > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + /* Check the first vector at very last to look for match. 
*/ > > > +L(vector_x1_ret): > > > + VPCMP $0, %VMM1, %VMM0, %k2 > > > + KMOV %k2, %RAX > > > + test %RAX, %RAX > > > + jz L(ret) > > > + > > > + BSR %RAX, %RAX > > > +# ifdef USE_AS_WCSRCHR > > > + leaq (%rsi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rsi, %rax > > > +# endif > > > + ret > > > + > > > +L(align_more): > > > + /* Zero r8 to store match result. */ > > > + xorq %r8, %r8 > > > + /* Save pointer of first vector, in case if no match found. */ > > > + movq %rdi, %rsi > > > + /* Align pointer to vector size. */ > > > + andq $-VEC_SIZE, %rdi > > > + /* Loop unroll 2 times for 2 vector loop. */ > > > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > > > + VPTESTN %VMM2, %VMM2, %k0 > > > + KMOV %k0, %RCX > > > + test %RCX, %RCX > > > + jnz L(vector_x2_end) > > > + > > > + /* Save pointer of second vector, in case if no match > > > + found. */ > > > + movq %rdi, %r9 > > > + /* Align address to VEC_SIZE * 2 for loop. */ > > > + andq $-(VEC_SIZE * 2), %rdi > > > + > > > + .p2align 4,,11 > > > +L(loop): > > > + /* 2 vector loop, as it provide better performance as compared > > > + to 4 vector loop. */ > > > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > > > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > > > + VPCMP $0, %VMM3, %VMM0, %k1 > > > + VPCMP $0, %VMM4, %VMM0, %k2 > > > + VPMINU %VMM3, %VMM4, %VMM5 > > > + VPTESTN %VMM5, %VMM5, %k0 > > > + KOR %k1, %k2, %k3 > > > + subq $-(VEC_SIZE * 2), %rdi > > > + /* If k0 and k3 zero, match and end of string not found. */ > > > + KORTEST %k0, %k3 > > > + jz L(loop) > > > + > > > + /* If k0 is non zero, end of string found. */ > > > + KORTEST %k0, %k0 > > > + jnz L(endloop) > > > + > > > + /* A match found, it need to be stored in r8 before loop > > > + continue. */ > > > + /* Check second vector first. 
*/ > > > + KMOV %k2, %RDX > > > + test %RDX, %RDX > > > + jz L(loop_vec_x3_ret) > > > + > > > + BSR %RDX, %RDX > > > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > > > + jmp L(loop) > > > + > > > + /* If second vector doesn't have match, first vector must > > > + have match. */ > > > +L(loop_vec_x3_ret): > > > + KMOV %k1, %R8 > > > + BSR %R8, %R8 > > > +# ifdef USE_AS_WCSRCHR > > > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > > > +# else > > > + add %rdi, %r8 > > > +# endif > > > + jmp L(loop) > > > + > > > +L(endloop): > > > + /* Check if string end in first loop vector. */ > > > + VPTESTN %VMM3, %VMM3, %k0 > > > + KMOV %k0, %RCX > > > + test %RCX, %RCX > > > + jnz L(vector_x3_end) > > > + > > > + /* Check if it has match in first loop vector. */ > > > + KMOV %k1, %RAX > > > + test %RAX, %RAX > > > + jz L(vector_x4_end) > > > + > > > + BSR %RAX, %RAX > > > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > > > + > > > + /* String must end in second loop vector. */ > > > +L(vector_x4_end): > > > + VPTESTN %VMM4, %VMM4, %k0 > > > + KMOV %k0, %RCX > > > + KMOV %k2, %RAX > > > + BLSMSK %RCX, %RCX > > > + /* Check if it has match in second loop vector. */ > > > + and %RCX, %RAX > > > + jz L(check_last_match) > > > + > > > + BSR %RAX, %RAX > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + /* String end in first loop vector. */ > > > +L(vector_x3_end): > > > + KMOV %k1, %RAX > > > + BLSMSK %RCX, %RCX > > > + /* Check if it has match in second loop vector. */ > > > + and %RCX, %RAX > > > + jz L(check_last_match) > > > + > > > + BSR %RAX, %RAX > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > + /* No match in first and second loop vector. */ > > > +L(check_last_match): > > > + /* Check if any match recorded in r8. */ > > > + test %r8, %r8 > > > + jz L(vector_x2_ret) > > > + movq %r8, %rax > > > + ret > > > + > > > + /* No match recorded in r8. Check the second saved vector > > > + in begining. 
*/ > > > +L(vector_x2_ret): > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > + KMOV %k2, %RAX > > > + test %RAX, %RAX > > > + jz L(vector_x1_ret) > > > + > > > + /* Match found in the second saved vector. */ > > > + BSR %RAX, %RAX > > > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > > > + ret > > > + > > > +L(page_cross): > > > + movl %eax, %ecx > > > +# ifdef USE_AS_WCSRCHR > > > + /* Calculate number of compare result bits to be skipped for > > > + wide string alignment adjustment. */ > > > + andl $(VEC_SIZE - 1), %ecx > > > + sarl $2, %ecx > > > +# endif > > > + /* ecx contains number of w[char] to be skipped as a result > > > + of address alignment. */ > > > + xorq %rdi, %rax > > > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > > > + > > > + VPTESTN %VMM1, %VMM1, %k1 > > > + KMOV %k1, %RAX > > > + SHR %cl, %RAX > > > + jz L(page_cross_continue) > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > + KMOV %k0, %RDX > > > + SHR %cl, %RDX > > > + BLSMSK %RAX, %RAX > > > + and %RDX, %RAX > > > + jz L(ret) > > > + BSR %RAX, %RAX > > > +# ifdef USE_AS_WCSRCHR > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + add %rdi, %rax > > > +# endif > > > + > > > + ret > > > +END (STRRCHR) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > new file mode 100644 > > > index 0000000000..f880848e09 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > @@ -0,0 +1,7 @@ > > > +# ifndef STRRCHR > > > +# define STRRCHR __strrchr_evex512 > > > +# endif > > > + > > > +#define VEC_SIZE 64 > > > + > > > +#include "strrchr-evex-base.S" > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > new file mode 100644 > > > index 0000000000..65b7710b22 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > @@ -0,0 +1,8 @@ > > > +#ifndef WCSRCHR > > > +# define WCSRCHR __wcsrchr_evex512 > > > +#endif > > > + > > > 
+#define STRRCHR WCSRCHR > > > +#define USE_AS_WCSRCHR 1 > > > + > > > +#include "strrchr-evex512.S" > > > -- > > > 2.36.1 > > > ping
On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > This patch implements following evex512 version of string functions. > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > depending on length and alignment. > > > > > > > > > > Please attach benchmark numbers. > > > > > > > - strrchr function using 512 bit vectors. > > > > - wcsrchr function using 512 bit vectors. > > > > > > > > Code size data: > > > > > > > > strrchr-evex.o 833 byte > > > > strrchr-evex512.o 573 byte (-31%) > > > > > > > > wcsrchr-evex.o 836 byte > > > > wcsrchr-evex512.o 581 byte (-31%) > > > > > > > > Placeholder function, not used by any processor at the moment. 
> > > > --- > > > > sysdeps/x86_64/multiarch/Makefile | 2 + > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > > > > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > > > > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > > > > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > > > > 5 files changed, 334 insertions(+) > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > index df4601c294..6a275f1c3d 100644 > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > @@ -110,6 +110,7 @@ sysdep_routines += \ > > > > strrchr-avx2 \ > > > > strrchr-avx2-rtm \ > > > > strrchr-evex \ > > > > + strrchr-evex512 \ > > > > strrchr-sse2 \ > > > > strspn-sse4 \ > > > > strstr-avx512 \ > > > > @@ -152,6 +153,7 @@ sysdep_routines += \ > > > > wcsrchr-avx2 \ > > > > wcsrchr-avx2-rtm \ > > > > wcsrchr-evex \ > > > > + wcsrchr-evex512 \ > > > > wcsrchr-sse2 \ > > > > wmemchr-avx2 \ > > > > wmemchr-avx2-rtm \ > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > index a71444eccb..26c941023a 100644 > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > __strrchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __strrchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > > > > 
CPU_FEATURE_USABLE (AVX2), > > > > __strrchr_avx2) > > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > __wcsrchr_evex) > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > + __wcsrchr_evex512) > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > > > > CPU_FEATURE_USABLE (AVX2), > > > > __wcsrchr_avx2) > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > new file mode 100644 > > > > index 0000000000..e937cb193c > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > @@ -0,0 +1,307 @@ > > > > +/* Placeholder function, not used by any processor at the moment. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +/* UNUSED. Exists purely as reference implementation. 
*/ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > +# include <sysdep.h> > > > > + > > > > +# ifdef USE_AS_WCSRCHR > > > > +# define CHAR_SIZE 4 > > > > +# define VPBROADCAST vpbroadcastd > > > > +# define VPCMP vpcmpd > > > > +# define VPMINU vpminud > > > > +# define VPTESTN vptestnmd > > > > +# else > > > > +# define CHAR_SIZE 1 > > > > +# define VPBROADCAST vpbroadcastb > > > > +# define VPCMP vpcmpb > > > > +# define VPMINU vpminub > > > > +# define VPTESTN vptestnmb > > > > +# endif > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > + > > > > +# if VEC_SIZE == 64 > > > > +# define BLSMSK blsmskq > > > > +# define BSR bsrq > > > > +# define KMOV kmovq > > > > +# define KOR korq > > > > +# define KORTEST kortestq > > > > +# define R8 r8 > > > > +# define RAX rax > > > > +# define RCX rcx > > > > +# define RDX rdx > > > > +# define SHR shrq > > > > +# define TEXTSUFFIX evex512 > > > > +# define VMM0 zmm16 > > > > +# define VMM1 zmm17 > > > > +# define VMM2 zmm18 > > > > +# define VMM3 zmm19 > > > > +# define VMM4 zmm20 > > > > +# define VMM5 zmm21 > > > > +# define VMOVA vmovdqa64 > > > > +# define VMOVU vmovdqu64 > > > > + > > > > +# elif VEC_SIZE == 32 > > > > +/* Currently Unused. 
*/ > > > > +# define BLSMSK blsmskl > > > > +# define BSR bsrl > > > > +# define KMOV kmovd > > > > +# define KOR kord > > > > +# define KORTEST kortestd > > > > +# define R8 r8d > > > > +# define RAX eax > > > > +# define RCX ecx > > > > +# define RDX edx > > > > +# define SHR shrl > > > > +# define TEXTSUFFIX evex256 > > > > +# define VMM0 ymm16 > > > > +# define VMM1 ymm17 > > > > +# define VMM2 ymm18 > > > > +# define VMM3 ymm19 > > > > +# define VMM4 ymm20 > > > > +# define VMM5 ymm21 > > > > +# define VMOVA vmovdqa32 > > > > +# define VMOVU vmovdqu32 > > > > +# endif > > > > + > > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > + one vector length string. */ > > > > +ENTRY_P2ALIGN (STRRCHR, 6) > > > > + > > > > + /* Broadcast CHAR to VMM0. */ > > > > + VPBROADCAST %esi, %VMM0 > > > > + movl %edi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(page_cross) > > > > + > > > > +L(page_cross_continue): > > > > + /* Compare [w]char for null, mask bit will be set for match. */ > > > > + VMOVU (%rdi), %VMM1 > > > > + > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > + KMOV %k1, %RCX > > > > + test %RCX, %RCX > > > > + jz L(align_more) > > > > + > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > + KMOV %k0, %RAX > > > > + BLSMSK %RCX, %RCX > > > > + and %RCX, %RAX > > > > + jz L(ret) > > > > + > > > > + BSR %RAX, %RAX > > > > +# ifdef USE_AS_WCSRCHR > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rdi, %rax > > > > +# endif > > > > +L(ret): > > > > + ret > > > > + > > > > +L(vector_x2_end): > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > + KMOV %k2, %RAX > > > > + BLSMSK %RCX, %RCX > > > > + and %RCX, %RAX > > > > + jz L(vector_x1_ret) > > > > + > > > > + BSR %RAX, %RAX > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + /* Check the first vector at very last to look for match. 
*/ > > > > +L(vector_x1_ret): > > > > + VPCMP $0, %VMM1, %VMM0, %k2 > > > > + KMOV %k2, %RAX > > > > + test %RAX, %RAX > > > > + jz L(ret) > > > > + > > > > + BSR %RAX, %RAX > > > > +# ifdef USE_AS_WCSRCHR > > > > + leaq (%rsi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rsi, %rax > > > > +# endif > > > > + ret > > > > + > > > > +L(align_more): > > > > + /* Zero r8 to store match result. */ > > > > + xorq %r8, %r8 > > > > + /* Save pointer of first vector, in case if no match found. */ > > > > + movq %rdi, %rsi > > > > + /* Align pointer to vector size. */ > > > > + andq $-VEC_SIZE, %rdi > > > > + /* Loop unroll 2 times for 2 vector loop. */ > > > > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > > > > + VPTESTN %VMM2, %VMM2, %k0 > > > > + KMOV %k0, %RCX > > > > + test %RCX, %RCX > > > > + jnz L(vector_x2_end) > > > > + > > > > + /* Save pointer of second vector, in case if no match > > > > + found. */ > > > > + movq %rdi, %r9 > > > > + /* Align address to VEC_SIZE * 2 for loop. */ > > > > + andq $-(VEC_SIZE * 2), %rdi > > > > + > > > > + .p2align 4,,11 > > > > +L(loop): > > > > + /* 2 vector loop, as it provide better performance as compared > > > > + to 4 vector loop. */ > > > > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > > > > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > > > > + VPCMP $0, %VMM3, %VMM0, %k1 > > > > + VPCMP $0, %VMM4, %VMM0, %k2 > > > > + VPMINU %VMM3, %VMM4, %VMM5 > > > > + VPTESTN %VMM5, %VMM5, %k0 > > > > + KOR %k1, %k2, %k3 > > > > + subq $-(VEC_SIZE * 2), %rdi > > > > + /* If k0 and k3 zero, match and end of string not found. */ > > > > + KORTEST %k0, %k3 > > > > + jz L(loop) > > > > + > > > > + /* If k0 is non zero, end of string found. */ > > > > + KORTEST %k0, %k0 > > > > + jnz L(endloop) > > > > + > > > > + /* A match found, it need to be stored in r8 before loop > > > > + continue. */ > > > > + /* Check second vector first. 
*/ > > > > + KMOV %k2, %RDX > > > > + test %RDX, %RDX > > > > + jz L(loop_vec_x3_ret) > > > > + > > > > + BSR %RDX, %RDX > > > > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > > > > + jmp L(loop) > > > > + > > > > + /* If second vector doesn't have match, first vector must > > > > + have match. */ > > > > +L(loop_vec_x3_ret): > > > > + KMOV %k1, %R8 > > > > + BSR %R8, %R8 > > > > +# ifdef USE_AS_WCSRCHR > > > > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > > > > +# else > > > > + add %rdi, %r8 > > > > +# endif > > > > + jmp L(loop) > > > > + > > > > +L(endloop): > > > > + /* Check if string end in first loop vector. */ > > > > + VPTESTN %VMM3, %VMM3, %k0 > > > > + KMOV %k0, %RCX > > > > + test %RCX, %RCX > > > > + jnz L(vector_x3_end) > > > > + > > > > + /* Check if it has match in first loop vector. */ > > > > + KMOV %k1, %RAX > > > > + test %RAX, %RAX > > > > + jz L(vector_x4_end) > > > > + > > > > + BSR %RAX, %RAX > > > > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > > > > + > > > > + /* String must end in second loop vector. */ > > > > +L(vector_x4_end): > > > > + VPTESTN %VMM4, %VMM4, %k0 > > > > + KMOV %k0, %RCX > > > > + KMOV %k2, %RAX > > > > + BLSMSK %RCX, %RCX > > > > + /* Check if it has match in second loop vector. */ > > > > + and %RCX, %RAX > > > > + jz L(check_last_match) > > > > + > > > > + BSR %RAX, %RAX > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + /* String end in first loop vector. */ > > > > +L(vector_x3_end): > > > > + KMOV %k1, %RAX > > > > + BLSMSK %RCX, %RCX > > > > + /* Check if it has match in second loop vector. */ > > > > + and %RCX, %RAX > > > > + jz L(check_last_match) > > > > + > > > > + BSR %RAX, %RAX > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > + /* No match in first and second loop vector. */ > > > > +L(check_last_match): > > > > + /* Check if any match recorded in r8. 
*/ > > > > + test %r8, %r8 > > > > + jz L(vector_x2_ret) > > > > + movq %r8, %rax > > > > + ret > > > > + > > > > + /* No match recorded in r8. Check the second saved vector > > > > + in begining. */ > > > > +L(vector_x2_ret): > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > + KMOV %k2, %RAX > > > > + test %RAX, %RAX > > > > + jz L(vector_x1_ret) > > > > + > > > > + /* Match found in the second saved vector. */ > > > > + BSR %RAX, %RAX > > > > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > > > > + ret > > > > + > > > > +L(page_cross): > > > > + movl %eax, %ecx > > > > +# ifdef USE_AS_WCSRCHR > > > > + /* Calculate number of compare result bits to be skipped for > > > > + wide string alignment adjustment. */ > > > > + andl $(VEC_SIZE - 1), %ecx > > > > + sarl $2, %ecx > > > > +# endif > > > > + /* ecx contains number of w[char] to be skipped as a result > > > > + of address alignment. */ > > > > + xorq %rdi, %rax > > > > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > > > > + > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > + KMOV %k1, %RAX > > > > + SHR %cl, %RAX > > > > + jz L(page_cross_continue) > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > + KMOV %k0, %RDX > > > > + SHR %cl, %RDX > > > > + BLSMSK %RAX, %RAX > > > > + and %RDX, %RAX > > > > + jz L(ret) > > > > + BSR %RAX, %RAX > > > > +# ifdef USE_AS_WCSRCHR > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + add %rdi, %rax > > > > +# endif > > > > + > > > > + ret > > > > +END (STRRCHR) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..f880848e09 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > @@ -0,0 +1,7 @@ > > > > +# ifndef STRRCHR > > > > +# define STRRCHR __strrchr_evex512 > > > > +# endif > > > > + > > > > +#define VEC_SIZE 64 > > > > + > > > > +#include "strrchr-evex-base.S" > > > > diff --git 
a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > new file mode 100644 > > > > index 0000000000..65b7710b22 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > @@ -0,0 +1,8 @@ > > > > +#ifndef WCSRCHR > > > > +# define WCSRCHR __wcsrchr_evex512 > > > > +#endif > > > > + > > > > +#define STRRCHR WCSRCHR > > > > +#define USE_AS_WCSRCHR 1 > > > > + > > > > +#include "strrchr-evex512.S" > > > > -- > > > > 2.36.1 > > > > > > ping Regarding this patch along with the corresponding memchr and strchr ones, I would prefer to try and implement the ZMM version alongside the YMM similar to what we do in memset/memmove. Since all/nearly all of the instructions are the same this shouldn't be too difficult with the `VEC(n)` macros. Examples are: https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512 and there is a congruent patch to strlen to do the same (still in the works): https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512 There are many good ideas in these patches that I believe would also apply to the YMM implementations and think it would be best to ensure both files are as close to optimal as we can get them as opposed to adding yet another bespoke implementation we need to maintain / keep optimized. Can you try and integrate this and the memchr/strchr implementations similar to how we do memmove/memset?
On Wed, Sep 28, 2022 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > > > > > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > This patch implements following evex512 version of string functions. > > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > > depending on length and alignment. > > > > > > > > > > > > > Please attach benchmark numbers. > > > > > > > > > - strrchr function using 512 bit vectors. > > > > > - wcsrchr function using 512 bit vectors. > > > > > > > > > > Code size data: > > > > > > > > > > strrchr-evex.o 833 byte > > > > > strrchr-evex512.o 573 byte (-31%) > > > > > > > > > > wcsrchr-evex.o 836 byte > > > > > wcsrchr-evex512.o 581 byte (-31%) > > > > > > > > > > Placeholder function, not used by any processor at the moment. 
> > > > > --- > > > > > sysdeps/x86_64/multiarch/Makefile | 2 + > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > > > > > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > > > > > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > > > > > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > > > > > 5 files changed, 334 insertions(+) > > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > > index df4601c294..6a275f1c3d 100644 > > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > > @@ -110,6 +110,7 @@ sysdep_routines += \ > > > > > strrchr-avx2 \ > > > > > strrchr-avx2-rtm \ > > > > > strrchr-evex \ > > > > > + strrchr-evex512 \ > > > > > strrchr-sse2 \ > > > > > strspn-sse4 \ > > > > > strstr-avx512 \ > > > > > @@ -152,6 +153,7 @@ sysdep_routines += \ > > > > > wcsrchr-avx2 \ > > > > > wcsrchr-avx2-rtm \ > > > > > wcsrchr-evex \ > > > > > + wcsrchr-evex512 \ > > > > > wcsrchr-sse2 \ > > > > > wmemchr-avx2 \ > > > > > wmemchr-avx2-rtm \ > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > index a71444eccb..26c941023a 100644 > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > > __strrchr_evex) > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > + 
__strrchr_evex512) > > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > > > > > CPU_FEATURE_USABLE (AVX2), > > > > > __strrchr_avx2) > > > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > > __wcsrchr_evex) > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > + __wcsrchr_evex512) > > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > > > > > CPU_FEATURE_USABLE (AVX2), > > > > > __wcsrchr_avx2) > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > new file mode 100644 > > > > > index 0000000000..e937cb193c > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > @@ -0,0 +1,307 @@ > > > > > +/* Placeholder function, not used by any processor at the moment. > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > + This file is part of the GNU C Library. > > > > > + > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > + License as published by the Free Software Foundation; either > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > + > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > + Lesser General Public License for more details. > > > > > + > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > + License along with the GNU C Library; if not, see > > > > > + <https://www.gnu.org/licenses/>. 
*/ > > > > > + > > > > > +/* UNUSED. Exists purely as reference implementation. */ > > > > > + > > > > > +#include <isa-level.h> > > > > > + > > > > > +#if ISA_SHOULD_BUILD (4) > > > > > + > > > > > +# include <sysdep.h> > > > > > + > > > > > +# ifdef USE_AS_WCSRCHR > > > > > +# define CHAR_SIZE 4 > > > > > +# define VPBROADCAST vpbroadcastd > > > > > +# define VPCMP vpcmpd > > > > > +# define VPMINU vpminud > > > > > +# define VPTESTN vptestnmd > > > > > +# else > > > > > +# define CHAR_SIZE 1 > > > > > +# define VPBROADCAST vpbroadcastb > > > > > +# define VPCMP vpcmpb > > > > > +# define VPMINU vpminub > > > > > +# define VPTESTN vptestnmb > > > > > +# endif > > > > > + > > > > > +# define PAGE_SIZE 4096 > > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > + > > > > > +# if VEC_SIZE == 64 > > > > > +# define BLSMSK blsmskq > > > > > +# define BSR bsrq > > > > > +# define KMOV kmovq > > > > > +# define KOR korq > > > > > +# define KORTEST kortestq > > > > > +# define R8 r8 > > > > > +# define RAX rax > > > > > +# define RCX rcx > > > > > +# define RDX rdx > > > > > +# define SHR shrq > > > > > +# define TEXTSUFFIX evex512 > > > > > +# define VMM0 zmm16 > > > > > +# define VMM1 zmm17 > > > > > +# define VMM2 zmm18 > > > > > +# define VMM3 zmm19 > > > > > +# define VMM4 zmm20 > > > > > +# define VMM5 zmm21 > > > > > +# define VMOVA vmovdqa64 > > > > > +# define VMOVU vmovdqu64 > > > > > + > > > > > +# elif VEC_SIZE == 32 > > > > > +/* Currently Unused. 
*/ > > > > > +# define BLSMSK blsmskl > > > > > +# define BSR bsrl > > > > > +# define KMOV kmovd > > > > > +# define KOR kord > > > > > +# define KORTEST kortestd > > > > > +# define R8 r8d > > > > > +# define RAX eax > > > > > +# define RCX ecx > > > > > +# define RDX edx > > > > > +# define SHR shrl > > > > > +# define TEXTSUFFIX evex256 > > > > > +# define VMM0 ymm16 > > > > > +# define VMM1 ymm17 > > > > > +# define VMM2 ymm18 > > > > > +# define VMM3 ymm19 > > > > > +# define VMM4 ymm20 > > > > > +# define VMM5 ymm21 > > > > > +# define VMOVA vmovdqa32 > > > > > +# define VMOVU vmovdqu32 > > > > > +# endif > > > > > + > > > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > > + one vector length string. */ > > > > > +ENTRY_P2ALIGN (STRRCHR, 6) > > > > > + > > > > > + /* Broadcast CHAR to VMM0. */ > > > > > + VPBROADCAST %esi, %VMM0 > > > > > + movl %edi, %eax > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > + ja L(page_cross) > > > > > + > > > > > +L(page_cross_continue): > > > > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > > > > + VMOVU (%rdi), %VMM1 > > > > > + > > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > > + KMOV %k1, %RCX > > > > > + test %RCX, %RCX > > > > > + jz L(align_more) > > > > > + > > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > > + KMOV %k0, %RAX > > > > > + BLSMSK %RCX, %RCX > > > > > + and %RCX, %RAX > > > > > + jz L(ret) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > +# ifdef USE_AS_WCSRCHR > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > +# else > > > > > + add %rdi, %rax > > > > > +# endif > > > > > +L(ret): > > > > > + ret > > > > > + > > > > > +L(vector_x2_end): > > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > > + KMOV %k2, %RAX > > > > > + BLSMSK %RCX, %RCX > > > > > + and %RCX, %RAX > > > > > + jz L(vector_x1_ret) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > + /* Check the first vector at very last to look for match. */ > > > > > +L(vector_x1_ret): > > > > > + VPCMP $0, %VMM1, %VMM0, %k2 > > > > > + KMOV %k2, %RAX > > > > > + test %RAX, %RAX > > > > > + jz L(ret) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > +# ifdef USE_AS_WCSRCHR > > > > > + leaq (%rsi, %rax, CHAR_SIZE), %rax > > > > > +# else > > > > > + add %rsi, %rax > > > > > +# endif > > > > > + ret > > > > > + > > > > > +L(align_more): > > > > > + /* Zero r8 to store match result. */ > > > > > + xorq %r8, %r8 > > > > > + /* Save pointer of first vector, in case if no match found. */ > > > > > + movq %rdi, %rsi > > > > > + /* Align pointer to vector size. */ > > > > > + andq $-VEC_SIZE, %rdi > > > > > + /* Loop unroll 2 times for 2 vector loop. */ > > > > > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > > > > > + VPTESTN %VMM2, %VMM2, %k0 > > > > > + KMOV %k0, %RCX > > > > > + test %RCX, %RCX > > > > > + jnz L(vector_x2_end) > > > > > + > > > > > + /* Save pointer of second vector, in case if no match > > > > > + found. */ > > > > > + movq %rdi, %r9 > > > > > + /* Align address to VEC_SIZE * 2 for loop. 
*/ > > > > > + andq $-(VEC_SIZE * 2), %rdi > > > > > + > > > > > + .p2align 4,,11 > > > > > +L(loop): > > > > > + /* 2 vector loop, as it provide better performance as compared > > > > > + to 4 vector loop. */ > > > > > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > > > > > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > > > > > + VPCMP $0, %VMM3, %VMM0, %k1 > > > > > + VPCMP $0, %VMM4, %VMM0, %k2 > > > > > + VPMINU %VMM3, %VMM4, %VMM5 > > > > > + VPTESTN %VMM5, %VMM5, %k0 > > > > > + KOR %k1, %k2, %k3 > > > > > + subq $-(VEC_SIZE * 2), %rdi > > > > > + /* If k0 and k3 zero, match and end of string not found. */ > > > > > + KORTEST %k0, %k3 > > > > > + jz L(loop) > > > > > + > > > > > + /* If k0 is non zero, end of string found. */ > > > > > + KORTEST %k0, %k0 > > > > > + jnz L(endloop) > > > > > + > > > > > + /* A match found, it need to be stored in r8 before loop > > > > > + continue. */ > > > > > + /* Check second vector first. */ > > > > > + KMOV %k2, %RDX > > > > > + test %RDX, %RDX > > > > > + jz L(loop_vec_x3_ret) > > > > > + > > > > > + BSR %RDX, %RDX > > > > > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > > > > > + jmp L(loop) > > > > > + > > > > > + /* If second vector doesn't have match, first vector must > > > > > + have match. */ > > > > > +L(loop_vec_x3_ret): > > > > > + KMOV %k1, %R8 > > > > > + BSR %R8, %R8 > > > > > +# ifdef USE_AS_WCSRCHR > > > > > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > > > > > +# else > > > > > + add %rdi, %r8 > > > > > +# endif > > > > > + jmp L(loop) > > > > > + > > > > > +L(endloop): > > > > > + /* Check if string end in first loop vector. */ > > > > > + VPTESTN %VMM3, %VMM3, %k0 > > > > > + KMOV %k0, %RCX > > > > > + test %RCX, %RCX > > > > > + jnz L(vector_x3_end) > > > > > + > > > > > + /* Check if it has match in first loop vector. 
*/ > > > > > + KMOV %k1, %RAX > > > > > + test %RAX, %RAX > > > > > + jz L(vector_x4_end) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > > > > > + > > > > > + /* String must end in second loop vector. */ > > > > > +L(vector_x4_end): > > > > > + VPTESTN %VMM4, %VMM4, %k0 > > > > > + KMOV %k0, %RCX > > > > > + KMOV %k2, %RAX > > > > > + BLSMSK %RCX, %RCX > > > > > + /* Check if it has match in second loop vector. */ > > > > > + and %RCX, %RAX > > > > > + jz L(check_last_match) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > + /* String end in first loop vector. */ > > > > > +L(vector_x3_end): > > > > > + KMOV %k1, %RAX > > > > > + BLSMSK %RCX, %RCX > > > > > + /* Check if it has match in second loop vector. */ > > > > > + and %RCX, %RAX > > > > > + jz L(check_last_match) > > > > > + > > > > > + BSR %RAX, %RAX > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > + /* No match in first and second loop vector. */ > > > > > +L(check_last_match): > > > > > + /* Check if any match recorded in r8. */ > > > > > + test %r8, %r8 > > > > > + jz L(vector_x2_ret) > > > > > + movq %r8, %rax > > > > > + ret > > > > > + > > > > > + /* No match recorded in r8. Check the second saved vector > > > > > + in begining. */ > > > > > +L(vector_x2_ret): > > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > > + KMOV %k2, %RAX > > > > > + test %RAX, %RAX > > > > > + jz L(vector_x1_ret) > > > > > + > > > > > + /* Match found in the second saved vector. */ > > > > > + BSR %RAX, %RAX > > > > > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > > > > > + ret > > > > > + > > > > > +L(page_cross): > > > > > + movl %eax, %ecx > > > > > +# ifdef USE_AS_WCSRCHR > > > > > + /* Calculate number of compare result bits to be skipped for > > > > > + wide string alignment adjustment. 
*/ > > > > > + andl $(VEC_SIZE - 1), %ecx > > > > > + sarl $2, %ecx > > > > > +# endif > > > > > + /* ecx contains number of w[char] to be skipped as a result > > > > > + of address alignment. */ > > > > > + xorq %rdi, %rax > > > > > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > > > > > + > > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > > + KMOV %k1, %RAX > > > > > + SHR %cl, %RAX > > > > > + jz L(page_cross_continue) > > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > > + KMOV %k0, %RDX > > > > > + SHR %cl, %RDX > > > > > + BLSMSK %RAX, %RAX > > > > > + and %RDX, %RAX > > > > > + jz L(ret) > > > > > + BSR %RAX, %RAX > > > > > +# ifdef USE_AS_WCSRCHR > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > +# else > > > > > + add %rdi, %rax > > > > > +# endif > > > > > + > > > > > + ret > > > > > +END (STRRCHR) > > > > > +#endif > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > new file mode 100644 > > > > > index 0000000000..f880848e09 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > @@ -0,0 +1,7 @@ > > > > > +# ifndef STRRCHR > > > > > +# define STRRCHR __strrchr_evex512 > > > > > +# endif > > > > > + > > > > > +#define VEC_SIZE 64 > > > > > + > > > > > +#include "strrchr-evex-base.S" > > > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > new file mode 100644 > > > > > index 0000000000..65b7710b22 > > > > > --- /dev/null > > > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > @@ -0,0 +1,8 @@ > > > > > +#ifndef WCSRCHR > > > > > +# define WCSRCHR __wcsrchr_evex512 > > > > > +#endif > > > > > + > > > > > +#define STRRCHR WCSRCHR > > > > > +#define USE_AS_WCSRCHR 1 > > > > > + > > > > > +#include "strrchr-evex512.S" > > > > > -- > > > > > 2.36.1 > > > > > > > > > ping > > Regarding this patch along with the corresponding memchr and strchr > ones, I would prefer to try and implement 
the ZMM version alongside > the YMM similar to what we do in memset/memmove. This is a question of methodology. Everyone has different ways to implement. I don't think it's fair to expect that everyone follows the same existing methodology. > > Since all/nearly all of the instructions are the same this shouldn't > be too difficult with the `VEC(n)` macros. > VEC(n) uses 3 levels of extra indirection to simply understand what actual registers are used. memrchr-evex.S->evex256-vecs.h->evex-vecs-common.h->vec-macros.h > Examples are: > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512 > > and there is a congruent patch to strlen to do the same (still in the > works): > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512 > > There are many good ideas in these patches that I believe would also > apply to the YMM implementations and think it would be best to ensure > both files are as close to optimal as we can get them as opposed to > adding yet another bespoke implementation we need to maintain / keep > optimized. > I don't think it's a good idea to centralize when the entire ecosystem is moving towards modularization and inclusion. Also it will not encourage any new contributors, if good ideas are taken from the patch and the actual patch is discarded just because it uses a different implementation methodology. > Can you try and integrate this and the memchr/strchr implementations > similar to how we do memmove/memset? Why? I don't see any reason for that.
On Fri, Sep 30, 2022 at 11:49 AM Sunil Pandey <skpgkp2@gmail.com> wrote: > > On Wed, Sep 28, 2022 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz > > > > > > > > > > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > > > > > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha > > > > > <libc-alpha@sourceware.org> wrote: > > > > > > > > > > > > This patch implements following evex512 version of string functions. > > > > > > evex512 version takes up to 30% less cycle as compared to evex, > > > > > > depending on length and alignment. > > > > > > > > > > > > > > > > Please attach benchmark numbers. > > > > > > > > > > > - strrchr function using 512 bit vectors. > > > > > > - wcsrchr function using 512 bit vectors. > > > > > > > > > > > > Code size data: > > > > > > > > > > > > strrchr-evex.o 833 byte > > > > > > strrchr-evex512.o 573 byte (-31%) > > > > > > > > > > > > wcsrchr-evex.o 836 byte > > > > > > wcsrchr-evex512.o 581 byte (-31%) > > > > > > > > > > > > Placeholder function, not used by any processor at the moment. 
> > > > > > --- > > > > > > sysdeps/x86_64/multiarch/Makefile | 2 + > > > > > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + > > > > > > sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++ > > > > > > sysdeps/x86_64/multiarch/strrchr-evex512.S | 7 + > > > > > > sysdeps/x86_64/multiarch/wcsrchr-evex512.S | 8 + > > > > > > 5 files changed, 334 insertions(+) > > > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > > create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > > create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > > > > > index df4601c294..6a275f1c3d 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/Makefile > > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile > > > > > > @@ -110,6 +110,7 @@ sysdep_routines += \ > > > > > > strrchr-avx2 \ > > > > > > strrchr-avx2-rtm \ > > > > > > strrchr-evex \ > > > > > > + strrchr-evex512 \ > > > > > > strrchr-sse2 \ > > > > > > strspn-sse4 \ > > > > > > strstr-avx512 \ > > > > > > @@ -152,6 +153,7 @@ sysdep_routines += \ > > > > > > wcsrchr-avx2 \ > > > > > > wcsrchr-avx2-rtm \ > > > > > > wcsrchr-evex \ > > > > > > + wcsrchr-evex512 \ > > > > > > wcsrchr-sse2 \ > > > > > > wmemchr-avx2 \ > > > > > > wmemchr-avx2-rtm \ > > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > index a71444eccb..26c941023a 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > > > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > (CPU_FEATURE_USABLE (AVX512VL) > > > > > > && CPU_FEATURE_USABLE (AVX512BW)), > > > > > > __strrchr_evex) > > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, > > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > > + && 
CPU_FEATURE_USABLE (AVX512BW) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > + __strrchr_evex512) > > > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, > > > > > > CPU_FEATURE_USABLE (AVX2), > > > > > > __strrchr_avx2) > > > > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > > > > > && CPU_FEATURE_USABLE (AVX512BW) > > > > > > && CPU_FEATURE_USABLE (BMI2)), > > > > > > __wcsrchr_evex) > > > > > > + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, > > > > > > + (CPU_FEATURE_USABLE (AVX512VL) > > > > > > + && CPU_FEATURE_USABLE (AVX512BW) > > > > > > + && CPU_FEATURE_USABLE (BMI2)), > > > > > > + __wcsrchr_evex512) > > > > > > X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, > > > > > > CPU_FEATURE_USABLE (AVX2), > > > > > > __wcsrchr_avx2) > > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > > new file mode 100644 > > > > > > index 0000000000..e937cb193c > > > > > > --- /dev/null > > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S > > > > > > @@ -0,0 +1,307 @@ > > > > > > +/* Placeholder function, not used by any processor at the moment. > > > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > > > + This file is part of the GNU C Library. > > > > > > + > > > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > > > + modify it under the terms of the GNU Lesser General Public > > > > > > + License as published by the Free Software Foundation; either > > > > > > + version 2.1 of the License, or (at your option) any later version. > > > > > > + > > > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > > > + Lesser General Public License for more details. 
> > > > > > + > > > > > > + You should have received a copy of the GNU Lesser General Public > > > > > > + License along with the GNU C Library; if not, see > > > > > > + <https://www.gnu.org/licenses/>. */ > > > > > > + > > > > > > +/* UNUSED. Exists purely as reference implementation. */ > > > > > > + > > > > > > +#include <isa-level.h> > > > > > > + > > > > > > +#if ISA_SHOULD_BUILD (4) > > > > > > + > > > > > > +# include <sysdep.h> > > > > > > + > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > +# define CHAR_SIZE 4 > > > > > > +# define VPBROADCAST vpbroadcastd > > > > > > +# define VPCMP vpcmpd > > > > > > +# define VPMINU vpminud > > > > > > +# define VPTESTN vptestnmd > > > > > > +# else > > > > > > +# define CHAR_SIZE 1 > > > > > > +# define VPBROADCAST vpbroadcastb > > > > > > +# define VPCMP vpcmpb > > > > > > +# define VPMINU vpminub > > > > > > +# define VPTESTN vptestnmb > > > > > > +# endif > > > > > > + > > > > > > +# define PAGE_SIZE 4096 > > > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > > + > > > > > > +# if VEC_SIZE == 64 > > > > > > +# define BLSMSK blsmskq > > > > > > +# define BSR bsrq > > > > > > +# define KMOV kmovq > > > > > > +# define KOR korq > > > > > > +# define KORTEST kortestq > > > > > > +# define R8 r8 > > > > > > +# define RAX rax > > > > > > +# define RCX rcx > > > > > > +# define RDX rdx > > > > > > +# define SHR shrq > > > > > > +# define TEXTSUFFIX evex512 > > > > > > +# define VMM0 zmm16 > > > > > > +# define VMM1 zmm17 > > > > > > +# define VMM2 zmm18 > > > > > > +# define VMM3 zmm19 > > > > > > +# define VMM4 zmm20 > > > > > > +# define VMM5 zmm21 > > > > > > +# define VMOVA vmovdqa64 > > > > > > +# define VMOVU vmovdqu64 > > > > > > + > > > > > > +# elif VEC_SIZE == 32 > > > > > > +/* Currently Unused. 
*/ > > > > > > +# define BLSMSK blsmskl > > > > > > +# define BSR bsrl > > > > > > +# define KMOV kmovd > > > > > > +# define KOR kord > > > > > > +# define KORTEST kortestd > > > > > > +# define R8 r8d > > > > > > +# define RAX eax > > > > > > +# define RCX ecx > > > > > > +# define RDX edx > > > > > > +# define SHR shrl > > > > > > +# define TEXTSUFFIX evex256 > > > > > > +# define VMM0 ymm16 > > > > > > +# define VMM1 ymm17 > > > > > > +# define VMM2 ymm18 > > > > > > +# define VMM3 ymm19 > > > > > > +# define VMM4 ymm20 > > > > > > +# define VMM5 ymm21 > > > > > > +# define VMOVA vmovdqa32 > > > > > > +# define VMOVU vmovdqu32 > > > > > > +# endif > > > > > > + > > > > > > + .section .text.TEXTSUFFIX, "ax", @progbits > > > > > > +/* Aligning entry point to 64 byte, provides better performance for > > > > > > + one vector length string. */ > > > > > > +ENTRY_P2ALIGN (STRRCHR, 6) > > > > > > + > > > > > > + /* Broadcast CHAR to VMM0. */ > > > > > > + VPBROADCAST %esi, %VMM0 > > > > > > + movl %edi, %eax > > > > > > + andl $(PAGE_SIZE - 1), %eax > > > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > > > + ja L(page_cross) > > > > > > + > > > > > > +L(page_cross_continue): > > > > > > + /* Compare [w]char for null, mask bit will be set for match. 
*/ > > > > > > + VMOVU (%rdi), %VMM1 > > > > > > + > > > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > > > + KMOV %k1, %RCX > > > > > > + test %RCX, %RCX > > > > > > + jz L(align_more) > > > > > > + > > > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > > > + KMOV %k0, %RAX > > > > > > + BLSMSK %RCX, %RCX > > > > > > + and %RCX, %RAX > > > > > > + jz L(ret) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > > +# else > > > > > > + add %rdi, %rax > > > > > > +# endif > > > > > > +L(ret): > > > > > > + ret > > > > > > + > > > > > > +L(vector_x2_end): > > > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > > > + KMOV %k2, %RAX > > > > > > + BLSMSK %RCX, %RCX > > > > > > + and %RCX, %RAX > > > > > > + jz L(vector_x1_ret) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > > > + ret > > > > > > + > > > > > > + /* Check the first vector at very last to look for match. */ > > > > > > +L(vector_x1_ret): > > > > > > + VPCMP $0, %VMM1, %VMM0, %k2 > > > > > > + KMOV %k2, %RAX > > > > > > + test %RAX, %RAX > > > > > > + jz L(ret) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > + leaq (%rsi, %rax, CHAR_SIZE), %rax > > > > > > +# else > > > > > > + add %rsi, %rax > > > > > > +# endif > > > > > > + ret > > > > > > + > > > > > > +L(align_more): > > > > > > + /* Zero r8 to store match result. */ > > > > > > + xorq %r8, %r8 > > > > > > + /* Save pointer of first vector, in case if no match found. */ > > > > > > + movq %rdi, %rsi > > > > > > + /* Align pointer to vector size. */ > > > > > > + andq $-VEC_SIZE, %rdi > > > > > > + /* Loop unroll 2 times for 2 vector loop. 
*/ > > > > > > + VMOVA (VEC_SIZE)(%rdi), %VMM2 > > > > > > + VPTESTN %VMM2, %VMM2, %k0 > > > > > > + KMOV %k0, %RCX > > > > > > + test %RCX, %RCX > > > > > > + jnz L(vector_x2_end) > > > > > > + > > > > > > + /* Save pointer of second vector, in case if no match > > > > > > + found. */ > > > > > > + movq %rdi, %r9 > > > > > > + /* Align address to VEC_SIZE * 2 for loop. */ > > > > > > + andq $-(VEC_SIZE * 2), %rdi > > > > > > + > > > > > > + .p2align 4,,11 > > > > > > +L(loop): > > > > > > + /* 2 vector loop, as it provide better performance as compared > > > > > > + to 4 vector loop. */ > > > > > > + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 > > > > > > + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 > > > > > > + VPCMP $0, %VMM3, %VMM0, %k1 > > > > > > + VPCMP $0, %VMM4, %VMM0, %k2 > > > > > > + VPMINU %VMM3, %VMM4, %VMM5 > > > > > > + VPTESTN %VMM5, %VMM5, %k0 > > > > > > + KOR %k1, %k2, %k3 > > > > > > + subq $-(VEC_SIZE * 2), %rdi > > > > > > + /* If k0 and k3 zero, match and end of string not found. */ > > > > > > + KORTEST %k0, %k3 > > > > > > + jz L(loop) > > > > > > + > > > > > > + /* If k0 is non zero, end of string found. */ > > > > > > + KORTEST %k0, %k0 > > > > > > + jnz L(endloop) > > > > > > + > > > > > > + /* A match found, it need to be stored in r8 before loop > > > > > > + continue. */ > > > > > > + /* Check second vector first. */ > > > > > > + KMOV %k2, %RDX > > > > > > + test %RDX, %RDX > > > > > > + jz L(loop_vec_x3_ret) > > > > > > + > > > > > > + BSR %RDX, %RDX > > > > > > + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 > > > > > > + jmp L(loop) > > > > > > + > > > > > > + /* If second vector doesn't have match, first vector must > > > > > > + have match. 
*/ > > > > > > +L(loop_vec_x3_ret): > > > > > > + KMOV %k1, %R8 > > > > > > + BSR %R8, %R8 > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > + leaq (%rdi, %r8, CHAR_SIZE), %r8 > > > > > > +# else > > > > > > + add %rdi, %r8 > > > > > > +# endif > > > > > > + jmp L(loop) > > > > > > + > > > > > > +L(endloop): > > > > > > + /* Check if string end in first loop vector. */ > > > > > > + VPTESTN %VMM3, %VMM3, %k0 > > > > > > + KMOV %k0, %RCX > > > > > > + test %RCX, %RCX > > > > > > + jnz L(vector_x3_end) > > > > > > + > > > > > > + /* Check if it has match in first loop vector. */ > > > > > > + KMOV %k1, %RAX > > > > > > + test %RAX, %RAX > > > > > > + jz L(vector_x4_end) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %r8 > > > > > > + > > > > > > + /* String must end in second loop vector. */ > > > > > > +L(vector_x4_end): > > > > > > + VPTESTN %VMM4, %VMM4, %k0 > > > > > > + KMOV %k0, %RCX > > > > > > + KMOV %k2, %RAX > > > > > > + BLSMSK %RCX, %RCX > > > > > > + /* Check if it has match in second loop vector. */ > > > > > > + and %RCX, %RAX > > > > > > + jz L(check_last_match) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax > > > > > > + ret > > > > > > + > > > > > > + /* String end in first loop vector. */ > > > > > > +L(vector_x3_end): > > > > > > + KMOV %k1, %RAX > > > > > > + BLSMSK %RCX, %RCX > > > > > > + /* Check if it has match in second loop vector. */ > > > > > > + and %RCX, %RAX > > > > > > + jz L(check_last_match) > > > > > > + > > > > > > + BSR %RAX, %RAX > > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > > + ret > > > > > > + > > > > > > + /* No match in first and second loop vector. */ > > > > > > +L(check_last_match): > > > > > > + /* Check if any match recorded in r8. */ > > > > > > + test %r8, %r8 > > > > > > + jz L(vector_x2_ret) > > > > > > + movq %r8, %rax > > > > > > + ret > > > > > > + > > > > > > + /* No match recorded in r8. 
Check the second saved vector > > > > > > + in begining. */ > > > > > > +L(vector_x2_ret): > > > > > > + VPCMP $0, %VMM2, %VMM0, %k2 > > > > > > + KMOV %k2, %RAX > > > > > > + test %RAX, %RAX > > > > > > + jz L(vector_x1_ret) > > > > > > + > > > > > > + /* Match found in the second saved vector. */ > > > > > > + BSR %RAX, %RAX > > > > > > + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax > > > > > > + ret > > > > > > + > > > > > > +L(page_cross): > > > > > > + movl %eax, %ecx > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > + /* Calculate number of compare result bits to be skipped for > > > > > > + wide string alignment adjustment. */ > > > > > > + andl $(VEC_SIZE - 1), %ecx > > > > > > + sarl $2, %ecx > > > > > > +# endif > > > > > > + /* ecx contains number of w[char] to be skipped as a result > > > > > > + of address alignment. */ > > > > > > + xorq %rdi, %rax > > > > > > + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 > > > > > > + > > > > > > + VPTESTN %VMM1, %VMM1, %k1 > > > > > > + KMOV %k1, %RAX > > > > > > + SHR %cl, %RAX > > > > > > + jz L(page_cross_continue) > > > > > > + VPCMP $0, %VMM1, %VMM0, %k0 > > > > > > + KMOV %k0, %RDX > > > > > > + SHR %cl, %RDX > > > > > > + BLSMSK %RAX, %RAX > > > > > > + and %RDX, %RAX > > > > > > + jz L(ret) > > > > > > + BSR %RAX, %RAX > > > > > > +# ifdef USE_AS_WCSRCHR > > > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > > > +# else > > > > > > + add %rdi, %rax > > > > > > +# endif > > > > > > + > > > > > > + ret > > > > > > +END (STRRCHR) > > > > > > +#endif > > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > > new file mode 100644 > > > > > > index 0000000000..f880848e09 > > > > > > --- /dev/null > > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S > > > > > > @@ -0,0 +1,7 @@ > > > > > > +# ifndef STRRCHR > > > > > > +# define STRRCHR __strrchr_evex512 > > > > > > +# endif > > > > > > + > > > > > > +#define VEC_SIZE 64 > > > > > > + > > > > > > 
+#include "strrchr-evex-base.S" > > > > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > new file mode 100644 > > > > > > index 0000000000..65b7710b22 > > > > > > --- /dev/null > > > > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S > > > > > > @@ -0,0 +1,8 @@ > > > > > > +#ifndef WCSRCHR > > > > > > +# define WCSRCHR __wcsrchr_evex512 > > > > > > +#endif > > > > > > + > > > > > > +#define STRRCHR WCSRCHR > > > > > > +#define USE_AS_WCSRCHR 1 > > > > > > + > > > > > > +#include "strrchr-evex512.S" > > > > > > -- > > > > > > 2.36.1 > > > > > > > > > > > > ping > > > > Regarding this patch along with the corresponding memchr and strchr > > ones, I would prefer to try and implement the ZMM version alongside > > the YMM similar to what we do in memset/memmove. > > This is a question of methodology. Everyone has different ways to > implement. I don't think it's fair to expect that everyone follows same > existing methodology. > > > > > Since all/nearly all of the instructions are the same this shouldn't > > be too difficult with the `VEC(n)` macros. > > > > VEC(n) uses 3 levels of extra indirection to simply understand what > actual registers are used. > > memrchr-evex.S->evex256-vecs.h->evex-vecs-common.h->vec-macros.h Imo it beats recopying the upcased GPR and VMM macros in each file. > > > Examples are: > > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512 > > > > and there is a congruent patch to strlen to do the same (still in the > > works): > > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512 > > > > There are many good ideas in these patches that I believe would also > > apply to the YMM implementations and think it would be best to ensure > > both files are as close to optimal as we can get them as opposed to > > adding yet another bespoke implementation we need to maintain / keep > > optimized. 
> > > > I don't think it's a good idea to centralize when the entire ecosystem is > moving towards modularization and inclusion. Reusing code promotes modularity. Tell me which is more modular? template<typename T> T max(T a, T b) { return a < b ? b : a; } or max_int(int a, int b) { .... } max_long(long a, long b) { ... } ? > > Also it will not encourage any new contributors, if good ideas > taken from the patch and discard the actual patch just because it's using > different implementation methodology. Is there a reason the evex512 implementation methodology doesn't suit evex256 or vice versa? They use just about the exact same instructions. Minus a few edge cases where the evex256 version combines 2x GPR for a bit-scan, there seem to be few cases the two can't share logic. > > > Can you try and integrate this and the memchr/strchr implementations > > similar to how we do memmove/memset? > > Why? I don't see any reason for that. The reasons are above.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index df4601c294..6a275f1c3d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -110,6 +110,7 @@ sysdep_routines += \ strrchr-avx2 \ strrchr-avx2-rtm \ strrchr-evex \ + strrchr-evex512 \ strrchr-sse2 \ strspn-sse4 \ strstr-avx512 \ @@ -152,6 +153,7 @@ sysdep_routines += \ wcsrchr-avx2 \ wcsrchr-avx2-rtm \ wcsrchr-evex \ + wcsrchr-evex512 \ wcsrchr-sse2 \ wmemchr-avx2 \ wmemchr-avx2-rtm \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index a71444eccb..26c941023a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strrchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strrchr_evex512) X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr, CPU_FEATURE_USABLE (AVX2), __strrchr_avx2) @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (AVX512BW) && CPU_FEATURE_USABLE (BMI2)), __wcsrchr_evex) + X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsrchr_evex512) X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr, CPU_FEATURE_USABLE (AVX2), __wcsrchr_avx2) diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S new file mode 100644 index 0000000000..e937cb193c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S @@ -0,0 +1,307 @@ +/* Placeholder function, not used by any processor at the moment. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* UNUSED. Exists purely as reference implementation. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + +# include <sysdep.h> + +# ifdef USE_AS_WCSRCHR +# define CHAR_SIZE 4 +# define VPBROADCAST vpbroadcastd +# define VPCMP vpcmpd +# define VPMINU vpminud +# define VPTESTN vptestnmd +# else +# define CHAR_SIZE 1 +# define VPBROADCAST vpbroadcastb +# define VPCMP vpcmpb +# define VPMINU vpminub +# define VPTESTN vptestnmb +# endif + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# if VEC_SIZE == 64 +# define BLSMSK blsmskq +# define BSR bsrq +# define KMOV kmovq +# define KOR korq +# define KORTEST kortestq +# define R8 r8 +# define RAX rax +# define RCX rcx +# define RDX rdx +# define SHR shrq +# define TEXTSUFFIX evex512 +# define VMM0 zmm16 +# define VMM1 zmm17 +# define VMM2 zmm18 +# define VMM3 zmm19 +# define VMM4 zmm20 +# define VMM5 zmm21 +# define VMOVA vmovdqa64 +# define VMOVU vmovdqu64 + +# elif VEC_SIZE == 32 +/* Currently Unused. 
*/ +# define BLSMSK blsmskl +# define BSR bsrl +# define KMOV kmovd +# define KOR kord +# define KORTEST kortestd +# define R8 r8d +# define RAX eax +# define RCX ecx +# define RDX edx +# define SHR shrl +# define TEXTSUFFIX evex256 +# define VMM0 ymm16 +# define VMM1 ymm17 +# define VMM2 ymm18 +# define VMM3 ymm19 +# define VMM4 ymm20 +# define VMM5 ymm21 +# define VMOVA vmovdqa32 +# define VMOVU vmovdqu32 +# endif + + .section .text.TEXTSUFFIX, "ax", @progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (STRRCHR, 6) + + /* Broadcast CHAR to VMM0. */ + VPBROADCAST %esi, %VMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + +L(page_cross_continue): + /* Compare [w]char for null, mask bit will be set for match. */ + VMOVU (%rdi), %VMM1 + + VPTESTN %VMM1, %VMM1, %k1 + KMOV %k1, %RCX + test %RCX, %RCX + jz L(align_more) + + VPCMP $0, %VMM1, %VMM0, %k0 + KMOV %k0, %RAX + BLSMSK %RCX, %RCX + and %RCX, %RAX + jz L(ret) + + BSR %RAX, %RAX +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif +L(ret): + ret + +L(vector_x2_end): + VPCMP $0, %VMM2, %VMM0, %k2 + KMOV %k2, %RAX + BLSMSK %RCX, %RCX + and %RCX, %RAX + jz L(vector_x1_ret) + + BSR %RAX, %RAX + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* Check the first vector at very last to look for match. */ +L(vector_x1_ret): + VPCMP $0, %VMM1, %VMM0, %k2 + KMOV %k2, %RAX + test %RAX, %RAX + jz L(ret) + + BSR %RAX, %RAX +# ifdef USE_AS_WCSRCHR + leaq (%rsi, %rax, CHAR_SIZE), %rax +# else + add %rsi, %rax +# endif + ret + +L(align_more): + /* Zero r8 to store match result. */ + xorq %r8, %r8 + /* Save pointer of first vector, in case if no match found. */ + movq %rdi, %rsi + /* Align pointer to vector size. */ + andq $-VEC_SIZE, %rdi + /* Loop unroll 2 times for 2 vector loop. 
*/ + VMOVA (VEC_SIZE)(%rdi), %VMM2 + VPTESTN %VMM2, %VMM2, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(vector_x2_end) + + /* Save pointer of second vector, in case if no match + found. */ + movq %rdi, %r9 + /* Align address to VEC_SIZE * 2 for loop. */ + andq $-(VEC_SIZE * 2), %rdi + + .p2align 4,,11 +L(loop): + /* 2 vector loop, as it provide better performance as compared + to 4 vector loop. */ + VMOVA (VEC_SIZE * 2)(%rdi), %VMM3 + VMOVA (VEC_SIZE * 3)(%rdi), %VMM4 + VPCMP $0, %VMM3, %VMM0, %k1 + VPCMP $0, %VMM4, %VMM0, %k2 + VPMINU %VMM3, %VMM4, %VMM5 + VPTESTN %VMM5, %VMM5, %k0 + KOR %k1, %k2, %k3 + subq $-(VEC_SIZE * 2), %rdi + /* If k0 and k3 zero, match and end of string not found. */ + KORTEST %k0, %k3 + jz L(loop) + + /* If k0 is non zero, end of string found. */ + KORTEST %k0, %k0 + jnz L(endloop) + + /* A match found, it need to be stored in r8 before loop + continue. */ + /* Check second vector first. */ + KMOV %k2, %RDX + test %RDX, %RDX + jz L(loop_vec_x3_ret) + + BSR %RDX, %RDX + leaq (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8 + jmp L(loop) + + /* If second vector doesn't have match, first vector must + have match. */ +L(loop_vec_x3_ret): + KMOV %k1, %R8 + BSR %R8, %R8 +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %r8, CHAR_SIZE), %r8 +# else + add %rdi, %r8 +# endif + jmp L(loop) + +L(endloop): + /* Check if string end in first loop vector. */ + VPTESTN %VMM3, %VMM3, %k0 + KMOV %k0, %RCX + test %RCX, %RCX + jnz L(vector_x3_end) + + /* Check if it has match in first loop vector. */ + KMOV %k1, %RAX + test %RAX, %RAX + jz L(vector_x4_end) + + BSR %RAX, %RAX + leaq (%rdi, %rax, CHAR_SIZE), %r8 + + /* String must end in second loop vector. */ +L(vector_x4_end): + VPTESTN %VMM4, %VMM4, %k0 + KMOV %k0, %RCX + KMOV %k2, %RAX + BLSMSK %RCX, %RCX + /* Check if it has match in second loop vector. */ + and %RCX, %RAX + jz L(check_last_match) + + BSR %RAX, %RAX + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* String end in first loop vector. 
*/ +L(vector_x3_end): + KMOV %k1, %RAX + BLSMSK %RCX, %RCX + /* Check if it has match in second loop vector. */ + and %RCX, %RAX + jz L(check_last_match) + + BSR %RAX, %RAX + leaq (%rdi, %rax, CHAR_SIZE), %rax + ret + + /* No match in first and second loop vector. */ +L(check_last_match): + /* Check if any match recorded in r8. */ + test %r8, %r8 + jz L(vector_x2_ret) + movq %r8, %rax + ret + + /* No match recorded in r8. Check the second saved vector + in begining. */ +L(vector_x2_ret): + VPCMP $0, %VMM2, %VMM0, %k2 + KMOV %k2, %RAX + test %RAX, %RAX + jz L(vector_x1_ret) + + /* Match found in the second saved vector. */ + BSR %RAX, %RAX + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax + ret + +L(page_cross): + movl %eax, %ecx +# ifdef USE_AS_WCSRCHR + /* Calculate number of compare result bits to be skipped for + wide string alignment adjustment. */ + andl $(VEC_SIZE - 1), %ecx + sarl $2, %ecx +# endif + /* ecx contains number of w[char] to be skipped as a result + of address alignment. */ + xorq %rdi, %rax + VMOVA (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1 + + VPTESTN %VMM1, %VMM1, %k1 + KMOV %k1, %RAX + SHR %cl, %RAX + jz L(page_cross_continue) + VPCMP $0, %VMM1, %VMM0, %k0 + KMOV %k0, %RDX + SHR %cl, %RDX + BLSMSK %RAX, %RAX + and %RDX, %RAX + jz L(ret) + BSR %RAX, %RAX +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + add %rdi, %rax +# endif + + ret +END (STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S new file mode 100644 index 0000000000..f880848e09 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S @@ -0,0 +1,7 @@ +# ifndef STRRCHR +# define STRRCHR __strrchr_evex512 +# endif + +#define VEC_SIZE 64 + +#include "strrchr-evex-base.S" diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S new file mode 100644 index 0000000000..65b7710b22 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S @@ -0,0 +1,8 @@ 
+#ifndef WCSRCHR +# define WCSRCHR __wcsrchr_evex512 +#endif + +#define STRRCHR WCSRCHR +#define USE_AS_WCSRCHR 1 + +#include "strrchr-evex512.S"