Message ID | 20220922001652.4039546-1-skpgkp2@gmail.com
---|---
State | New
Series | x86_64: Implement evex512 version of strchrnul, strchr and wcschr
On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> evex512 version takes up to 30% less cycle as compared to evex,
> depending on length and alignment.

Please attach benchmark numbers.

>
> - strchrnul function using 512 bit vectors.
> - strchr function using 512 bit vectors.
> - wcschr function using 512 bit vectors.
>
> Code size data:
>
> strchrnul-evex.o      615 byte
> strchrnul-evex512.o   573 byte (-7%)
>
> strchr-evex.o         670 byte
> strchr-evex512.o      616 byte (-8%)
>
> wcschr-evex.o         678 byte
> wcschr-evex512.o      620 byte (-9%)
>
> Placeholder function, not used by any processor at the moment.
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   3 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
>  sysdeps/x86_64/multiarch/strchr-evex-base.S  | 294 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/strchr-evex512.S    |   7 +
>  sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
>  sysdeps/x86_64/multiarch/wcschr-evex512.S    |   8 +
>  6 files changed, 332 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index df4601c294..89b58fa557 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -60,11 +60,13 @@ sysdep_routines += \
>    strchr-avx2 \
>    strchr-avx2-rtm \
>    strchr-evex \
> +  strchr-evex512 \
>    strchr-sse2 \
>    strchr-sse2-no-bsf \
>    strchrnul-avx2 \
>    strchrnul-avx2-rtm \
>    strchrnul-evex \
> +  strchrnul-evex512 \
>    strchrnul-sse2 \
>    strcmp-avx2 \
>    strcmp-avx2-rtm \
> @@ -129,6 +131,7 @@ sysdep_routines += \
>    wcschr-avx2 \
>    wcschr-avx2-rtm \
>    wcschr-evex \
> +  wcschr-evex512 \
>    wcschr-sse2 \
>    wcscmp-avx2 \
>    wcscmp-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a71444eccb..bce1d15171 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -518,6 +518,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strchr_evex)
> +      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strchr_evex512)
>        X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (BMI2)),
> @@ -543,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strchrnul_evex)
> +      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __strchrnul_evex512)
>        X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (BMI2)),
> @@ -753,6 +761,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcschr_evex)
> +      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)),
> +                             __wcschr_evex512)
>        X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
>                               (CPU_FEATURE_USABLE (AVX2)
>                                && CPU_FEATURE_USABLE (BMI2)),
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> new file mode 100644
> index 0000000000..919dafc8b6
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> @@ -0,0 +1,294 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* UNUSED. Exists purely as reference implementation.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSCHR
> +#  define CHAR_REG      esi
> +#  define CHAR_SIZE     4
> +#  define VPBROADCAST   vpbroadcastd
> +#  define VPCMP         vpcmpd
> +#  define VPMINU        vpminud
> +#  define VPTESTN       vptestnmd
> +# else
> +#  define CHAR_REG      sil
> +#  define CHAR_SIZE     1
> +#  define VPBROADCAST   vpbroadcastb
> +#  define VPCMP         vpcmpb
> +#  define VPMINU        vpminub
> +#  define VPTESTN       vptestnmb
> +# endif
> +
> +# define PAGE_SIZE      4096
> +# define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
> +# define XMM1           xmm17
> +
> +# if VEC_SIZE == 64
> +#  define KMOV          kmovq
> +#  define KORTEST       kortestq
> +#  define RAX           rax
> +#  define RCX           rcx
> +#  define RDX           rdx
> +#  define SHR           shrq
> +#  define TEXTSUFFIX    evex512
> +#  define VMM0          zmm16
> +#  define VMM1          zmm17
> +#  define VMM2          zmm18
> +#  define VMM3          zmm19
> +#  define VMM4          zmm20
> +#  define VMM5          zmm21
> +#  define VMOVA         vmovdqa64
> +#  define VMOVU         vmovdqu64
> +
> +# elif VEC_SIZE == 32
> +/* Currently Unused.  */
> +#  define KMOV          kmovd
> +#  define KORTEST       kortestd
> +#  define RAX           eax
> +#  define RCX           ecx
> +#  define RDX           edx
> +#  define SHR           shrl
> +#  define TEXTSUFFIX    evex256
> +#  define VMM0          ymm16
> +#  define VMM1          ymm17
> +#  define VMM2          ymm18
> +#  define VMM3          ymm19
> +#  define VMM4          ymm20
> +#  define VMM5          ymm21
> +#  define VMOVA         vmovdqa32
> +#  define VMOVU         vmovdqu32
> +# endif
> +
> +	.section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRCHR, 6)
> +
> +	/* Broadcast CHAR to VMM0.  */
> +	VPBROADCAST %esi, %VMM0
> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(page_cross)
> +
> +	/* Compare [w]char for null, mask bit will be set for match.  */
> +	VMOVU	(%rdi), %VMM1
> +
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +
> +	KMOV	%k0, %RAX
> +# ifndef USE_AS_STRCHRNUL
> +	test	%RAX, %RAX
> +	jz	L(align_more)
> +	bsf	%RAX, %RAX
> +# else
> +	/* For strchnul, using bsf, if string is less than 64 byte,
> +	   entire logic will fit in 64 byte cache line and offset
> +	   the perf gap as compared to evex version.  Even though
> +	   using bsf as condition can save code size but it is not
> +	   preferred for conditional jump for 2 reason.  1) It's
> +	   latency is 3.  2) Unlike test, it can't be micro-fused
> +	   with jump.  */
> +	bsf	%RAX, %RAX
> +	jz	L(align_more)
> +# endif
> +
> +# ifdef USE_AS_WCSCHR
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	add	%rdi, %rax
> +# endif
> +# ifndef USE_AS_STRCHRNUL
> +	cmp	(%rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	ret
> +
> +# ifndef USE_AS_STRCHRNUL
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
> +# endif
> +
> +L(ret_vec_x2):
> +	subq	$-VEC_SIZE, %rax
> +L(ret_vec_x1):
> +	bsf	%RCX, %RCX
> +# ifdef USE_AS_WCSCHR
> +	leaq	(%rax, %rcx, CHAR_SIZE), %rax
> +# else
> +	add	%rcx, %rax
> +# endif
> +
> +# ifndef USE_AS_STRCHRNUL
> +	cmp	(%rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	ret
> +
> +L(align_more):
> +	leaq	VEC_SIZE(%rdi), %rax
> +	/* Align rax to VEC_SIZE.  */
> +	andq	$-VEC_SIZE, %rax
> +
> +	/* Loop unroll 4 times for 4 vector loop.  */
> +	VMOVA	(%rax), %VMM1
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x1)
> +
> +	VMOVA	VEC_SIZE(%rax), %VMM1
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x2)
> +
> +	VMOVA	(VEC_SIZE * 2)(%rax), %VMM1
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x3)
> +
> +	VMOVA	(VEC_SIZE * 3)(%rax), %VMM1
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x4)
> +
> +	/* Align address to VEC_SIZE * 4 for loop.  */
> +	andq	$-(VEC_SIZE * 4), %rax
> +
> +	.p2align 4,,11
> +L(loop):
> +	/* VPMINU and VPCMP combination provide better performance as
> +	   compared to alternative combinations.  */
> +	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
> +	VMOVA	(VEC_SIZE * 5)(%rax), %VMM2
> +	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
> +	VMOVA	(VEC_SIZE * 7)(%rax), %VMM4
> +
> +	vpxorq	%VMM1, %VMM0, %VMM5
> +	VPMINU	%VMM5, %VMM1, %VMM1
> +
> +	VPCMP	$4, %VMM0, %VMM2, %k1
> +	VPMINU	%VMM1, %VMM2, %VMM2{%k1}{z}
> +
> +	VPCMP	$4, %VMM0, %VMM3, %k2
> +	VPMINU	%VMM2, %VMM3, %VMM3{%k2}{z}
> +
> +	VPCMP	$4, %VMM0, %VMM4, %k3
> +	VPMINU	%VMM3, %VMM4, %VMM4{%k3}{z}
> +
> +	VPTESTN	%VMM4, %VMM4, %k3
> +
> +	subq	$-(VEC_SIZE * 4), %rax
> +	KORTEST %k3, %k3
> +	jz	L(loop)
> +
> +	VPTESTN	%VMM1, %VMM1, %k0
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x1)
> +
> +	VPTESTN	%VMM2, %VMM2, %k0
> +	KMOV	%k0, %RCX
> +	/* At this point, if k1 is non zero, null char must be in the
> +	   second vector.  */
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x2)
> +
> +	VPTESTN	%VMM3, %VMM3, %k0
> +	KMOV	%k0, %RCX
> +	test	%RCX, %RCX
> +	jnz	L(ret_vec_x3)
> +	/* At this point null [w]char must be in the fourth vector so no
> +	   need to check.  */
> +	KMOV	%k3, %RCX
> +
> +L(ret_vec_x4):
> +	bsf	%RCX, %RCX
> +	leaq	(VEC_SIZE * 3)(%rax, %rcx, CHAR_SIZE), %rax
> +# ifndef USE_AS_STRCHRNUL
> +	cmp	(%rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	ret
> +
> +L(ret_vec_x3):
> +	bsf	%RCX, %RCX
> +	leaq	(VEC_SIZE * 2)(%rax, %rcx, CHAR_SIZE), %rax
> +# ifndef USE_AS_STRCHRNUL
> +	cmp	(%rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	ret
> +
> +L(page_cross):
> +	movl	%eax, %ecx
> +# ifdef USE_AS_WCSCHR
> +	/* Calculate number of compare result bits to be skipped for
> +	   wide string alignment adjustment.  */
> +	andl	$(VEC_SIZE - 1), %ecx
> +	sarl	$2, %ecx
> +# endif
> +	/* ecx contains number of w[char] to be skipped as a result
> +	   of address alignment.  */
> +	xorq	%rdi, %rax
> +	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> +	vpxorq	%VMM1, %VMM0, %VMM2
> +	VPMINU	%VMM2, %VMM1, %VMM2
> +	VPTESTN	%VMM2, %VMM2, %k0
> +	KMOV	%k0, %RAX
> +	/* Ignore number of character for alignment adjustment.  */
> +	SHR	%cl, %RAX
> +	jz	L(align_more)
> +
> +	bsf	%RAX, %RAX
> +# ifdef USE_AS_WCSCHR
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%rdi, %rax
> +# endif
> +
> +# ifndef USE_AS_STRCHRNUL
> +	cmp	(%rax), %CHAR_REG
> +	jne	L(zero)
> +# endif
> +	ret
> +
> +END (STRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
> new file mode 100644
> index 0000000000..4079bf387d
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
> @@ -0,0 +1,7 @@
> +# ifndef STRCHR
> +# define STRCHR	__strchr_evex512
> +# endif
> +
> +#define VEC_SIZE	64
> +
> +#include "strchr-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> new file mode 100644
> index 0000000000..1be0b12f38
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> @@ -0,0 +1,8 @@
> +#ifndef STRCHRNUL
> +# define STRCHRNUL	__strchrnul_evex512
> +#endif
> +
> +#define STRCHR	STRCHRNUL
> +#define USE_AS_STRCHRNUL 1
> +
> +#include "strchr-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> new file mode 100644
> index 0000000000..50c87ab1e5
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> @@ -0,0 +1,8 @@
> +#ifndef WCSCHR
> +# define WCSCHR	__wcschr_evex512
> +#endif
> +
> +#define STRCHR	WCSCHR
> +#define USE_AS_WCSCHR 1
> +
> +#include "strchr-evex512.S"
> --
> 2.36.1
>
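For readers skimming the quoted kernel: the VPXOR/VPMINU/VPTESTN sequence folds the "is this CHAR?" and "is this NUL?" tests into one mask, because a lane of min(data ^ c, data) is zero exactly when the lane holds either the searched character or the terminating NUL. A rough scalar C model of that predicate follows; it is not part of the patch, and the function names are made up for illustration.

/* Per-lane test modelled in scalar C: hit iff the byte is NUL or equals c.  */
static inline int lane_hits (unsigned char data, unsigned char c)
{
  unsigned char x = data ^ c;              /* 0 when data == c.  */
  unsigned char m = x < data ? x : data;   /* Unsigned min: 0 when data == c
					      or data == 0.  */
  return m == 0;
}

/* Reference strchrnul built on the same predicate, one byte at a time.  */
static const char *ref_strchrnul (const char *s, int c_in)
{
  unsigned char c = (unsigned char) c_in;
  while (!lane_hits ((unsigned char) *s, c))
    s++;
  return s;
}

The vector code then uses bsf on the resulting mask to locate the first hit, and for strchr (but not strchrnul) re-reads the byte at the returned position to distinguish a real match from the NUL terminator.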
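The ifunc-impl-list.c hunks register the new variants behind an AVX512VL && AVX512BW usability check. glibc's actual selection goes through its own CPU_FEATURE_USABLE machinery, but the shape of such feature-gated dispatch can be sketched with GCC's ifunc attribute; everything below is illustrative, my_strchr and both helpers are invented names, and the "evex512" stand-in simply calls libc.

#include <string.h>

static char *chr_fallback (const char *s, int c) { return strchr (s, c); }
static char *chr_evex512 (const char *s, int c) { return strchr (s, c); /* stand-in */ }

/* Resolver runs once at load time and picks an implementation based on
   the same two CPU features the patch tests for.  */
static char *(*resolve_my_strchr (void)) (const char *, int)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx512vl")
      && __builtin_cpu_supports ("avx512bw"))
    return chr_evex512;
  return chr_fallback;
}

char *my_strchr (const char *, int)
     __attribute__ ((ifunc ("resolve_my_strchr")));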
Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > evex512 version takes up to 30% less cycle as compared to evex,
> > depending on length and alignment.
>
> Please attach benchmark numbers.
>
> > [rest of quoted patch trimmed; identical to the message above]

Function: strchrnul
Variant: __strchrnul_evex  __strchrnul_evex512
========================================================================
alignment=0, pos=32: 9.21 7.26 ( 21.22%)
alignment=1, pos=32: 9.19 7.29 ( 20.76%)
alignment=0, pos=64: 12.33 9.31 ( 24.48%)
alignment=2, pos=64: 9.43 7.27 ( 22.91%)
alignment=0, pos=128: 8.68 9.29 ( -6.99%)
alignment=3, pos=128: 7.65 9.04 (-18.16%)
alignment=0, pos=256: 14.07 9.80 ( 30.36%)
alignment=4, pos=256: 14.27 9.75 ( 31.68%)
alignment=0, pos=512: 20.16 16.58 ( 17.80%)
alignment=5, pos=512: 21.46 17.57 ( 18.12%)
alignment=0, pos=1024: 31.16 23.03 ( 26.07%)
alignment=6, pos=1024: 31.09 23.12 ( 25.64%) alignment=0, pos=2048: 53.15 36.37 ( 31.58%) alignment=7, pos=2048: 53.16 36.43 ( 31.47%) alignment=0, pos=32: 6.08 5.08 ( 16.44%) alignment=1, pos=32: 6.07 5.09 ( 16.21%) alignment=0, pos=64: 7.91 6.41 ( 18.95%) alignment=2, pos=64: 8.02 6.41 ( 20.00%) alignment=0, pos=128: 8.26 9.15 (-10.73%) alignment=3, pos=128: 7.64 9.04 (-18.33%) alignment=0, pos=256: 14.05 9.80 ( 30.24%) alignment=4, pos=256: 14.26 9.77 ( 31.53%) alignment=0, pos=512: 20.10 16.58 ( 17.52%) alignment=5, pos=512: 20.20 16.61 ( 17.76%) alignment=0, pos=1024: 31.18 23.03 ( 26.15%) alignment=6, pos=1024: 31.09 23.12 ( 25.65%) alignment=0, pos=2048: 56.32 38.52 ( 31.60%) alignment=7, pos=2048: 53.42 40.10 ( 24.94%) alignment=1, pos=64: 8.04 6.40 ( 20.38%) alignment=1, pos=64: 7.53 6.40 ( 15.04%) alignment=2, pos=64: 7.48 6.39 ( 14.55%) alignment=2, pos=64: 7.99 6.43 ( 19.49%) alignment=3, pos=64: 7.99 6.45 ( 19.28%) alignment=3, pos=64: 8.00 6.45 ( 19.31%) alignment=4, pos=64: 8.00 6.45 ( 19.39%) alignment=4, pos=64: 7.98 6.46 ( 19.06%) alignment=5, pos=64: 7.94 6.40 ( 19.36%) alignment=5, pos=64: 8.37 6.76 ( 19.22%) alignment=6, pos=64: 8.03 6.41 ( 20.13%) alignment=6, pos=64: 8.05 6.42 ( 20.26%) alignment=7, pos=64: 8.04 6.41 ( 20.23%) alignment=7, pos=64: 8.03 6.40 ( 20.30%) alignment=0, pos=256: 14.24 9.79 ( 31.29%) alignment=0, pos=256: 14.22 9.75 ( 31.40%) alignment=16, pos=256: 14.11 9.79 ( 30.63%) alignment=16, pos=256: 14.24 9.77 ( 31.38%) alignment=32, pos=256: 14.76 9.77 ( 33.86%) alignment=32, pos=256: 14.70 9.78 ( 33.48%) alignment=48, pos=256: 14.65 9.75 ( 33.46%) alignment=48, pos=256: 14.46 9.80 ( 32.21%) alignment=64, pos=256: 15.52 9.81 ( 36.78%) alignment=64, pos=256: 15.50 9.76 ( 37.01%) alignment=80, pos=256: 15.50 9.79 ( 36.83%) alignment=80, pos=256: 15.40 9.75 ( 36.69%) alignment=96, pos=256: 13.08 9.79 ( 25.14%) alignment=96, pos=256: 13.03 9.81 ( 24.74%) alignment=112, pos=256: 13.20 9.76 ( 26.07%) alignment=112, pos=256: 13.23 9.75 ( 26.27%) alignment=0, pos=0: 5.19 5.09 ( 1.92%) alignment=0, pos=0: 5.37 4.67 ( 13.16%) alignment=0, pos=1: 5.33 4.71 ( 11.69%) alignment=0, pos=1: 5.33 4.67 ( 12.50%) alignment=0, pos=2: 5.37 4.67 ( 13.17%) alignment=0, pos=2: 5.33 4.67 ( 12.50%) alignment=0, pos=3: 5.33 4.67 ( 12.50%) alignment=0, pos=3: 5.33 4.67 ( 12.50%) alignment=0, pos=4: 5.45 5.10 ( 6.44%) alignment=0, pos=4: 5.45 5.09 ( 6.62%) alignment=0, pos=5: 5.47 5.10 ( 6.76%) alignment=0, pos=5: 5.46 5.08 ( 6.98%) alignment=0, pos=6: 5.42 5.09 ( 6.03%) alignment=0, pos=6: 5.41 5.07 ( 6.31%) alignment=0, pos=7: 5.70 5.35 ( 6.04%) alignment=0, pos=7: 5.33 4.71 ( 11.76%) alignment=0, pos=8: 5.44 5.08 ( 6.61%) alignment=0, pos=8: 5.44 5.07 ( 6.79%) alignment=0, pos=9: 5.42 5.07 ( 6.31%) alignment=0, pos=9: 5.54 5.11 ( 7.72%) alignment=0, pos=10: 5.42 5.07 ( 6.42%) alignment=0, pos=10: 5.42 5.14 ( 5.21%) alignment=0, pos=11: 5.45 5.08 ( 6.73%) alignment=0, pos=11: 5.39 5.07 ( 5.92%) alignment=0, pos=12: 5.46 5.08 ( 6.89%) alignment=0, pos=12: 5.40 5.11 ( 5.32%) alignment=0, pos=13: 5.42 5.07 ( 6.30%) alignment=0, pos=13: 5.41 5.13 ( 5.11%) alignment=0, pos=14: 5.39 5.08 ( 5.73%) alignment=0, pos=14: 5.43 5.08 ( 6.54%) alignment=0, pos=15: 5.50 5.08 ( 7.68%) alignment=0, pos=15: 5.40 5.12 ( 5.24%) alignment=0, pos=16: 5.41 5.09 ( 5.85%) alignment=0, pos=16: 5.44 5.07 ( 6.90%) alignment=0, pos=17: 5.42 5.09 ( 6.06%) alignment=0, pos=17: 5.40 5.08 ( 5.92%) alignment=0, pos=18: 5.41 5.12 ( 5.34%) alignment=0, pos=18: 5.45 5.09 ( 6.55%) alignment=0, pos=19: 5.45 5.09 
( 6.57%) alignment=0, pos=19: 5.41 5.10 ( 5.77%) alignment=0, pos=20: 5.41 5.09 ( 5.91%) alignment=0, pos=20: 5.41 5.12 ( 5.25%) alignment=0, pos=21: 5.44 5.08 ( 6.69%) alignment=0, pos=21: 5.31 5.10 ( 4.03%) alignment=0, pos=22: 5.45 5.10 ( 6.40%) alignment=0, pos=22: 5.41 5.11 ( 5.53%) alignment=0, pos=23: 5.39 5.11 ( 5.22%) alignment=0, pos=23: 5.42 5.09 ( 6.11%) alignment=0, pos=24: 5.39 5.11 ( 5.23%) alignment=0, pos=24: 5.41 5.08 ( 6.11%) alignment=0, pos=25: 5.43 5.09 ( 6.33%) alignment=0, pos=25: 5.41 5.08 ( 6.11%) alignment=0, pos=26: 5.42 5.09 ( 6.02%) alignment=0, pos=26: 5.45 5.08 ( 6.74%) alignment=0, pos=27: 5.40 5.12 ( 5.09%) alignment=0, pos=27: 5.42 5.08 ( 6.25%) alignment=0, pos=28: 5.42 5.07 ( 6.45%) alignment=0, pos=28: 5.72 5.39 ( 5.63%) alignment=0, pos=29: 5.41 5.10 ( 5.74%) alignment=0, pos=29: 5.42 5.09 ( 5.94%) alignment=0, pos=30: 5.41 5.08 ( 6.05%) alignment=0, pos=30: 5.40 5.08 ( 5.88%) alignment=0, pos=31: 5.41 5.09 ( 6.06%) alignment=0, pos=31: 5.41 5.08 ( 6.12%) alignment=0, pos=32: 6.09 5.11 ( 16.03%) alignment=1, pos=32: 6.10 5.08 ( 16.67%) alignment=0, pos=64: 8.00 6.42 ( 19.77%) alignment=2, pos=64: 7.99 6.41 ( 19.78%) alignment=0, pos=128: 7.51 9.10 (-21.15%) alignment=3, pos=128: 7.67 9.04 (-17.87%) alignment=0, pos=256: 14.23 9.76 ( 31.42%) alignment=4, pos=256: 14.26 9.76 ( 31.53%) alignment=0, pos=512: 20.05 16.61 ( 17.19%) alignment=5, pos=512: 20.10 16.62 ( 17.35%) alignment=0, pos=1024: 30.81 22.74 ( 26.20%) alignment=6, pos=1024: 31.33 23.15 ( 26.11%) alignment=0, pos=2048: 53.30 36.36 ( 31.79%) alignment=7, pos=2048: 53.38 36.37 ( 31.87%) alignment=0, pos=32: 6.05 5.07 ( 16.08%) alignment=1, pos=32: 6.10 5.09 ( 16.62%) alignment=0, pos=64: 7.98 6.80 ( 14.84%) alignment=2, pos=64: 8.00 6.48 ( 18.98%) alignment=0, pos=128: 7.60 9.10 (-19.74%) alignment=3, pos=128: 7.70 9.04 (-17.44%) alignment=0, pos=256: 14.26 9.80 ( 31.31%) alignment=4, pos=256: 14.10 9.74 ( 30.89%) alignment=0, pos=512: 20.88 16.59 ( 20.52%) alignment=5, pos=512: 20.24 16.59 ( 18.02%) alignment=0, pos=1024: 31.25 23.08 ( 26.16%) alignment=6, pos=1024: 30.76 23.01 ( 25.20%) alignment=0, pos=2048: 53.53 36.36 ( 32.07%) alignment=7, pos=2048: 53.20 36.38 ( 31.61%) alignment=1, pos=64: 7.62 6.40 ( 16.07%) alignment=1, pos=64: 7.68 6.43 ( 16.30%) alignment=2, pos=64: 7.84 6.41 ( 18.30%) alignment=2, pos=64: 8.04 6.41 ( 20.26%) alignment=3, pos=64: 8.03 6.40 ( 20.26%) alignment=3, pos=64: 7.85 6.40 ( 18.43%) alignment=4, pos=64: 8.00 6.42 ( 19.76%) alignment=4, pos=64: 7.99 6.46 ( 19.22%) alignment=5, pos=64: 8.04 6.40 ( 20.37%) alignment=5, pos=64: 8.19 6.77 ( 17.36%) alignment=6, pos=64: 8.43 6.76 ( 19.77%) alignment=6, pos=64: 8.42 6.84 ( 18.84%) alignment=7, pos=64: 7.98 6.40 ( 19.78%) alignment=7, pos=64: 8.00 6.43 ( 19.64%) alignment=0, pos=256: 14.27 9.81 ( 31.26%) alignment=0, pos=256: 14.00 9.75 ( 30.36%) alignment=16, pos=256: 14.25 9.76 ( 31.50%) alignment=16, pos=256: 14.06 9.78 ( 30.44%) alignment=32, pos=256: 14.80 9.81 ( 33.74%) alignment=32, pos=256: 14.77 9.79 ( 33.74%) alignment=48, pos=256: 14.99 9.79 ( 34.67%) alignment=48, pos=256: 14.67 9.81 ( 33.17%) alignment=64, pos=256: 15.49 9.80 ( 36.72%) alignment=64, pos=256: 15.50 9.79 ( 36.86%) alignment=80, pos=256: 15.51 10.21 ( 34.22%) alignment=80, pos=256: 15.38 9.79 ( 36.39%) alignment=96, pos=256: 13.09 9.78 ( 25.25%) alignment=96, pos=256: 13.06 9.89 ( 24.31%) alignment=112, pos=256: 13.17 9.79 ( 25.69%) alignment=112, pos=256: 13.10 9.81 ( 25.15%) alignment=0, pos=0: 5.19 5.12 ( 1.41%) alignment=0, pos=0: 
5.33 4.67 ( 12.50%) alignment=0, pos=1: 5.33 4.81 ( 9.79%) alignment=0, pos=1: 5.33 4.67 ( 12.50%) alignment=0, pos=2: 5.33 4.70 ( 11.87%) alignment=0, pos=2: 5.33 4.67 ( 12.50%) alignment=0, pos=3: 5.33 4.67 ( 12.50%) alignment=0, pos=3: 5.37 4.67 ( 13.15%) alignment=0, pos=4: 5.33 4.71 ( 11.75%) alignment=0, pos=4: 5.33 4.67 ( 12.50%) alignment=0, pos=5: 5.33 4.67 ( 12.50%) alignment=0, pos=5: 5.33 4.67 ( 12.50%) alignment=0, pos=6: 5.41 5.08 ( 6.13%) alignment=0, pos=6: 5.45 5.08 ( 6.79%) alignment=0, pos=7: 5.43 5.08 ( 6.50%) alignment=0, pos=7: 5.46 5.08 ( 7.04%) alignment=0, pos=8: 5.40 5.08 ( 5.99%) alignment=0, pos=8: 5.41 5.08 ( 6.16%) alignment=0, pos=9: 5.33 4.67 ( 12.50%) alignment=0, pos=9: 5.33 4.71 ( 11.76%) alignment=0, pos=10: 5.45 5.10 ( 6.39%) alignment=0, pos=10: 5.43 5.09 ( 6.28%) alignment=0, pos=11: 5.40 5.09 ( 5.76%) alignment=0, pos=11: 5.33 4.71 ( 11.75%) alignment=0, pos=12: 5.39 5.07 ( 5.86%) alignment=0, pos=12: 5.40 5.09 ( 5.61%) alignment=0, pos=13: 5.46 5.08 ( 6.82%) alignment=0, pos=13: 5.42 5.10 ( 5.90%) alignment=0, pos=14: 5.45 5.08 ( 6.70%) alignment=0, pos=14: 5.42 5.08 ( 6.24%) alignment=0, pos=15: 5.42 5.07 ( 6.38%) alignment=0, pos=15: 5.42 5.14 ( 5.12%) alignment=0, pos=16: 5.40 5.08 ( 5.83%) alignment=0, pos=16: 5.42 5.09 ( 6.14%) alignment=0, pos=17: 5.40 5.10 ( 5.48%) alignment=0, pos=17: 5.41 5.07 ( 6.40%) alignment=0, pos=18: 5.44 5.08 ( 6.56%) alignment=0, pos=18: 5.39 5.07 ( 5.86%) alignment=0, pos=19: 5.41 5.08 ( 6.22%) alignment=0, pos=19: 5.42 5.09 ( 6.09%) alignment=0, pos=20: 5.41 5.09 ( 5.83%) alignment=0, pos=20: 5.39 5.14 ( 4.76%) alignment=0, pos=21: 5.42 5.11 ( 5.69%) alignment=0, pos=21: 5.43 5.09 ( 6.20%) alignment=0, pos=22: 5.69 5.41 ( 4.96%) alignment=0, pos=22: 5.40 5.15 ( 4.71%) alignment=0, pos=23: 5.44 5.09 ( 6.52%) alignment=0, pos=23: 5.42 5.08 ( 6.31%) alignment=0, pos=24: 5.40 5.12 ( 5.12%) alignment=0, pos=24: 5.43 5.08 ( 6.38%) alignment=0, pos=25: 5.41 5.09 ( 5.98%) alignment=0, pos=25: 5.46 5.08 ( 6.93%) alignment=0, pos=26: 5.41 5.08 ( 6.19%) alignment=0, pos=26: 5.45 5.09 ( 6.62%) alignment=0, pos=27: 5.41 5.12 ( 5.40%) alignment=0, pos=27: 5.40 5.08 ( 5.95%) alignment=0, pos=28: 5.44 5.07 ( 6.80%) alignment=0, pos=28: 5.39 5.13 ( 4.75%) alignment=0, pos=29: 5.39 5.08 ( 5.84%) alignment=0, pos=29: 5.39 5.09 ( 5.66%) alignment=0, pos=30: 5.50 5.07 ( 7.86%) alignment=0, pos=30: 5.41 5.10 ( 5.84%) alignment=0, pos=31: 5.39 5.13 ( 4.94%) alignment=0, pos=31: 5.39 5.13 ( 4.80%) Function: wcschr Variant: __wcschr_evex __wcschr_evex512 ======================================================================================================================== alignment=1, pos=64: 15.96 10.14 ( 36.48%) alignment=1, pos=64: 15.89 10.17 ( 35.95%) alignment=2, pos=64: 15.23 10.09 ( 33.70%) alignment=2, pos=64: 15.82 10.13 ( 35.97%) alignment=3, pos=64: 14.97 9.97 ( 33.38%) alignment=3, pos=64: 15.02 9.59 ( 36.15%) alignment=4, pos=64: 14.87 9.52 ( 35.96%) alignment=4, pos=64: 14.98 9.59 ( 35.97%) alignment=5, pos=64: 15.02 9.59 ( 36.19%) alignment=5, pos=64: 14.87 9.57 ( 35.67%) alignment=6, pos=64: 14.96 9.59 ( 35.92%) alignment=6, pos=64: 14.97 9.33 ( 37.64%) alignment=7, pos=64: 15.03 9.53 ( 36.60%) alignment=7, pos=64: 14.99 9.58 ( 36.07%) alignment=0, pos=256: 31.47 22.50 ( 28.51%) alignment=0, pos=256: 31.44 22.50 ( 28.44%) alignment=16, pos=256: 33.48 23.18 ( 30.77%) alignment=16, pos=256: 33.47 23.14 ( 30.86%) alignment=32, pos=256: 31.08 23.79 ( 23.47%) alignment=32, pos=256: 31.43 23.67 ( 24.68%) alignment=48, pos=256: 
32.88 20.82 ( 36.68%) alignment=48, pos=256: 32.84 20.84 ( 36.54%) alignment=64, pos=256: 30.94 23.31 ( 24.66%) alignment=64, pos=256: 33.00 23.82 ( 27.81%) alignment=80, pos=256: 32.86 23.15 ( 29.56%) alignment=80, pos=256: 33.01 23.20 ( 29.73%) alignment=96, pos=256: 30.87 23.65 ( 23.38%) alignment=96, pos=256: 30.91 23.66 ( 23.44%) alignment=112, pos=256: 32.86 20.83 ( 36.62%) alignment=112, pos=256: 33.08 20.07 ( 39.32%) alignment=0, pos=0: 5.84 5.11 ( 12.42%) alignment=0, pos=0: 6.04 4.67 ( 22.75%) alignment=0, pos=1: 6.00 4.71 ( 21.53%) alignment=0, pos=1: 6.00 4.71 ( 21.55%) alignment=0, pos=2: 6.00 4.71 ( 21.56%) alignment=0, pos=2: 6.00 4.67 ( 22.22%) alignment=0, pos=3: 6.00 4.71 ( 21.56%) alignment=0, pos=3: 6.00 4.71 ( 21.56%) alignment=0, pos=4: 6.06 5.12 ( 15.57%) alignment=0, pos=4: 6.09 5.09 ( 16.45%) alignment=0, pos=5: 6.00 4.67 ( 22.22%) alignment=0, pos=5: 6.00 4.67 ( 22.22%) alignment=0, pos=6: 6.22 5.09 ( 18.11%) alignment=0, pos=6: 6.11 5.11 ( 16.38%) alignment=0, pos=7: 6.00 4.67 ( 22.22%) alignment=0, pos=7: 6.00 4.70 ( 21.66%) alignment=0, pos=8: 6.12 5.11 ( 16.49%) alignment=0, pos=8: 6.00 4.71 ( 21.57%) alignment=0, pos=9: 6.07 5.11 ( 15.69%) alignment=0, pos=9: 5.84 5.13 ( 12.07%) alignment=0, pos=10: 6.08 5.13 ( 15.71%) alignment=0, pos=10: 5.84 5.17 ( 11.48%) alignment=0, pos=11: 6.08 5.10 ( 16.01%) alignment=0, pos=11: 6.02 5.09 ( 15.53%) alignment=0, pos=12: 6.00 4.67 ( 22.22%) alignment=0, pos=12: 6.11 5.09 ( 16.66%) alignment=0, pos=13: 5.84 5.13 ( 12.12%) alignment=0, pos=13: 6.00 4.67 ( 22.22%) alignment=0, pos=14: 6.11 5.09 ( 16.67%) alignment=0, pos=14: 6.20 5.09 ( 17.89%) alignment=0, pos=15: 6.00 4.71 ( 21.56%) alignment=0, pos=15: 6.04 5.14 ( 14.97%) alignment=0, pos=16: 7.41 7.75 ( -4.59%) alignment=0, pos=16: 6.72 7.75 (-15.30%) alignment=0, pos=17: 6.71 7.75 (-15.41%) alignment=0, pos=17: 6.71 7.79 (-16.10%) alignment=0, pos=18: 6.67 7.38 (-10.65%) alignment=0, pos=18: 6.67 7.38 (-10.65%) alignment=0, pos=19: 6.78 7.80 (-15.14%) alignment=0, pos=19: 6.78 7.74 (-14.14%) alignment=0, pos=20: 6.71 7.76 (-15.62%) alignment=0, pos=20: 6.72 7.79 (-15.91%) alignment=0, pos=21: 6.78 7.74 (-14.07%) alignment=0, pos=21: 6.72 7.73 (-15.01%) alignment=0, pos=22: 6.71 7.78 (-15.85%) alignment=0, pos=22: 6.88 7.80 (-13.40%) alignment=0, pos=23: 6.74 7.74 (-14.91%) alignment=0, pos=23: 6.71 7.77 (-15.71%) alignment=0, pos=24: 7.62 7.74 ( -1.52%) alignment=0, pos=24: 7.39 7.78 ( -5.26%) alignment=0, pos=25: 7.37 7.74 ( -5.06%) alignment=0, pos=25: 7.42 7.75 ( -4.51%) alignment=0, pos=26: 7.38 7.79 ( -5.49%) alignment=0, pos=26: 7.37 7.79 ( -5.71%) alignment=0, pos=27: 7.38 7.72 ( -4.61%) alignment=0, pos=27: 7.44 7.73 ( -3.93%) alignment=0, pos=28: 7.40 7.72 ( -4.40%) alignment=0, pos=28: 7.43 7.73 ( -4.02%) alignment=0, pos=29: 7.40 7.73 ( -4.36%) alignment=0, pos=29: 7.43 7.73 ( -4.07%) alignment=0, pos=30: 7.45 7.73 ( -3.86%) alignment=0, pos=30: 7.36 7.79 ( -5.79%) alignment=0, pos=31: 7.36 7.79 ( -5.82%) alignment=0, pos=31: 7.36 7.78 ( -5.73%) alignment=1, pos=64: 15.02 9.54 ( 36.48%) alignment=1, pos=64: 14.80 9.32 ( 37.02%) alignment=2, pos=64: 15.04 9.57 ( 36.37%) alignment=2, pos=64: 14.86 9.27 ( 37.59%) alignment=3, pos=64: 15.03 9.58 ( 36.23%) alignment=3, pos=64: 15.01 9.57 ( 36.25%) alignment=4, pos=64: 14.85 9.52 ( 35.92%) alignment=4, pos=64: 14.98 9.57 ( 36.12%) alignment=5, pos=64: 15.00 9.57 ( 36.20%) alignment=5, pos=64: 15.03 9.56 ( 36.38%) alignment=6, pos=64: 14.91 9.29 ( 37.69%) alignment=6, pos=64: 14.97 9.57 ( 36.06%) alignment=7, 
pos=64: 15.03 9.32 ( 38.03%) alignment=7, pos=64: 14.88 9.52 ( 35.99%) alignment=0, pos=256: 31.49 22.53 ( 28.46%) alignment=0, pos=256: 31.44 22.49 ( 28.47%) alignment=16, pos=256: 35.25 24.55 ( 30.36%) alignment=16, pos=256: 33.41 23.18 ( 30.61%) alignment=32, pos=256: 32.87 23.63 ( 28.11%) alignment=32, pos=256: 32.53 23.96 ( 26.34%) alignment=48, pos=256: 32.74 21.50 ( 34.34%) alignment=48, pos=256: 33.19 20.86 ( 37.15%) alignment=64, pos=256: 31.01 22.47 ( 27.53%) alignment=64, pos=256: 30.98 22.50 ( 27.38%) alignment=80, pos=256: 33.02 23.21 ( 29.72%) alignment=80, pos=256: 32.96 23.14 ( 29.79%) alignment=96, pos=256: 30.93 23.62 ( 23.64%) alignment=96, pos=256: 30.89 23.65 ( 23.43%) alignment=112, pos=256: 32.78 20.83 ( 36.46%) alignment=112, pos=256: 32.82 20.83 ( 36.53%) alignment=0, pos=0: 5.84 5.11 ( 12.44%) alignment=0, pos=0: 6.00 4.67 ( 22.22%) alignment=0, pos=1: 6.00 4.67 ( 22.22%) alignment=0, pos=1: 6.04 4.67 ( 22.73%) alignment=0, pos=2: 6.04 4.67 ( 22.74%) alignment=0, pos=2: 6.04 4.67 ( 22.75%) alignment=0, pos=3: 6.05 5.11 ( 15.49%) alignment=0, pos=3: 6.00 4.67 ( 22.22%) alignment=0, pos=4: 6.00 4.67 ( 22.22%) alignment=0, pos=4: 6.00 4.71 ( 21.56%) alignment=0, pos=5: 6.00 4.71 ( 21.56%) alignment=0, pos=5: 6.05 4.67 ( 22.80%) alignment=0, pos=6: 6.00 4.67 ( 22.22%) alignment=0, pos=6: 6.00 4.71 ( 21.56%) alignment=0, pos=7: 6.11 5.12 ( 16.22%) alignment=0, pos=7: 6.06 5.11 ( 15.77%) alignment=0, pos=8: 6.00 4.70 ( 21.66%) alignment=0, pos=8: 6.06 5.09 ( 15.92%) alignment=0, pos=9: 5.84 5.14 ( 11.96%) alignment=0, pos=9: 6.13 5.10 ( 16.82%) alignment=0, pos=10: 5.84 5.14 ( 11.92%) alignment=0, pos=10: 6.08 5.11 ( 15.93%) alignment=0, pos=11: 5.84 5.15 ( 11.82%) alignment=0, pos=11: 6.09 5.09 ( 16.45%) alignment=0, pos=12: 6.43 5.09 ( 20.90%) alignment=0, pos=12: 6.00 4.71 ( 21.56%) alignment=0, pos=13: 8.71 8.40 ( 3.63%) alignment=0, pos=13: 7.05 4.94 ( 29.88%) alignment=0, pos=14: 7.63 5.58 ( 26.87%) alignment=0, pos=14: 7.70 6.00 ( 22.02%) alignment=0, pos=15: 6.55 5.66 ( 13.55%) alignment=0, pos=15: 6.40 5.38 ( 15.89%) alignment=0, pos=16: 7.77 8.30 ( -6.82%) alignment=0, pos=16: 7.06 7.85 (-11.14%) alignment=0, pos=17: 7.18 8.35 (-16.21%) alignment=0, pos=17: 7.35 8.18 (-11.21%) alignment=0, pos=18: 7.11 8.26 (-16.14%) alignment=0, pos=18: 6.92 7.74 (-11.85%) alignment=0, pos=19: 6.80 7.74 (-13.80%) alignment=0, pos=19: 6.89 7.73 (-12.20%) alignment=0, pos=20: 6.75 7.77 (-15.22%) alignment=0, pos=20: 6.67 7.37 (-10.60%) alignment=0, pos=21: 6.76 7.76 (-14.83%) alignment=0, pos=21: 6.73 7.72 (-14.81%) alignment=0, pos=22: 6.73 7.79 (-15.78%) alignment=0, pos=22: 6.75 7.75 (-14.70%) alignment=0, pos=23: 6.73 7.76 (-15.30%) alignment=0, pos=23: 6.72 8.50 (-26.43%) alignment=0, pos=24: 8.03 7.75 ( 3.53%) alignment=0, pos=24: 7.43 7.75 ( -4.32%) alignment=0, pos=25: 7.37 7.78 ( -5.63%) alignment=0, pos=25: 7.51 7.79 ( -3.67%) alignment=0, pos=26: 7.38 7.80 ( -5.70%) alignment=0, pos=26: 7.43 7.73 ( -3.96%) alignment=0, pos=27: 7.41 7.74 ( -4.47%) alignment=0, pos=27: 7.39 7.78 ( -5.24%) alignment=0, pos=28: 7.38 7.78 ( -5.42%) alignment=0, pos=28: 7.39 7.73 ( -4.73%) alignment=0, pos=29: 7.38 7.74 ( -4.86%) alignment=0, pos=29: 7.43 7.73 ( -4.08%) alignment=0, pos=30: 7.39 7.77 ( -5.18%) alignment=0, pos=30: 7.38 7.78 ( -5.37%) alignment=0, pos=31: 7.42 7.78 ( -4.85%) alignment=0, pos=31: 7.40 7.77 ( -5.04%) alignment=0, pos=15: 8.09 7.80 ( 3.58%) alignment=0, pos=15: 7.83 6.74 ( 13.95%) alignment=0, pos=15: 10.61 10.60 ( 0.05%) alignment=0, pos=15: 10.04 9.27 ( 
7.66%) alignment=0, pos=15: 11.82 10.94 ( 7.45%) alignment=0, pos=15: 11.59 11.77 ( -1.56%) alignment=0, pos=15: 13.17 12.26 ( 6.84%) alignment=0, pos=15: 12.78 11.66 ( 8.75%) alignment=0, pos=15: 16.30 14.06 ( 13.72%) alignment=0, pos=15: 16.66 13.95 ( 16.30%) alignment=0, pos=15: 12.08 10.97 ( 9.21%) alignment=0, pos=15: 12.45 10.89 ( 12.50%) alignment=0, pos=15: 10.73 8.85 ( 17.50%) alignment=0, pos=15: 10.85 8.95 ( 17.57%) alignment=0, pos=15: 8.69 6.85 ( 21.21%) alignment=0, pos=15: 8.47 7.00 ( 17.34%) alignment=0, pos=15: 8.11 7.34 ( 9.51%) alignment=0, pos=15: 7.94 7.12 ( 10.31%) Function: strchr Variant: __strchr_evex __strchr_evex512 ======================================================================================================================== alignment=0, pos=32: 9.22 6.28 ( 31.96%) alignment=1, pos=32: 8.64 5.81 ( 32.70%) alignment=0, pos=64: 8.78 8.03 ( 8.54%) alignment=2, pos=64: 8.24 7.66 ( 7.12%) alignment=0, pos=128: 9.04 7.80 ( 13.77%) alignment=3, pos=128: 8.30 6.96 ( 16.22%) alignment=0, pos=256: 14.01 8.76 ( 37.53%) alignment=4, pos=256: 14.20 8.72 ( 38.60%) alignment=0, pos=512: 20.50 16.21 ( 20.92%) alignment=5, pos=512: 20.65 16.47 ( 20.26%) alignment=0, pos=1024: 31.78 23.15 ( 27.14%) alignment=6, pos=1024: 31.81 23.13 ( 27.27%) alignment=0, pos=2048: 54.18 36.24 ( 33.11%) alignment=7, pos=2048: 54.39 36.22 ( 33.41%) alignment=0, pos=32: 6.09 4.45 ( 26.87%) alignment=1, pos=32: 6.08 4.45 ( 26.75%) alignment=0, pos=64: 6.78 6.41 ( 5.35%) alignment=2, pos=64: 6.71 6.43 ( 4.14%) alignment=0, pos=128: 8.13 7.08 ( 12.89%) alignment=3, pos=128: 8.09 7.14 ( 11.73%) alignment=0, pos=256: 13.98 8.68 ( 37.91%) alignment=4, pos=256: 14.41 8.71 ( 39.54%) alignment=0, pos=512: 20.68 16.19 ( 21.70%) alignment=5, pos=512: 20.67 16.25 ( 21.39%) alignment=0, pos=1024: 32.91 23.35 ( 29.06%) alignment=6, pos=1024: 31.78 22.96 ( 27.76%) alignment=0, pos=2048: 53.93 36.19 ( 32.89%) alignment=7, pos=2048: 53.89 36.19 ( 32.84%) alignment=1, pos=64: 6.74 6.53 ( 3.02%) alignment=1, pos=64: 6.78 6.42 ( 5.33%) alignment=2, pos=64: 6.74 6.46 ( 4.20%) alignment=2, pos=64: 6.77 6.43 ( 5.03%) alignment=3, pos=64: 6.59 6.44 ( 2.15%) alignment=3, pos=64: 6.76 6.43 ( 4.89%) alignment=4, pos=64: 6.71 6.47 ( 3.61%) alignment=4, pos=64: 6.75 6.41 ( 4.98%) alignment=5, pos=64: 6.77 6.48 ( 4.27%) alignment=5, pos=64: 6.86 6.44 ( 6.16%) alignment=6, pos=64: 6.77 6.43 ( 5.13%) alignment=6, pos=64: 6.74 6.44 ( 4.48%) alignment=7, pos=64: 7.16 6.79 ( 5.17%) alignment=7, pos=64: 7.32 6.79 ( 7.16%) alignment=0, pos=256: 14.12 8.72 ( 38.20%) alignment=0, pos=256: 13.92 8.74 ( 37.19%) alignment=16, pos=256: 14.43 8.73 ( 39.49%) alignment=16, pos=256: 13.88 8.77 ( 36.82%) alignment=32, pos=256: 14.84 8.78 ( 40.84%) alignment=32, pos=256: 14.85 8.76 ( 40.98%) alignment=48, pos=256: 15.31 8.77 ( 42.70%) alignment=48, pos=256: 21.82 8.78 ( 59.77%) alignment=64, pos=256: 16.06 8.73 ( 45.63%) alignment=64, pos=256: 15.95 8.58 ( 46.22%) alignment=80, pos=256: 17.02 9.22 ( 45.82%) alignment=80, pos=256: 15.75 8.74 ( 44.48%) alignment=96, pos=256: 13.33 8.80 ( 33.96%) alignment=96, pos=256: 13.30 8.76 ( 34.12%) alignment=112, pos=256: 13.57 8.77 ( 35.35%) alignment=112, pos=256: 13.47 8.74 ( 35.11%) alignment=0, pos=0: 3.89 4.44 (-14.06%) alignment=0, pos=0: 4.04 4.00 ( 0.99%) alignment=0, pos=1: 4.00 4.04 ( -0.98%) alignment=0, pos=1: 4.00 4.04 ( -0.98%) alignment=0, pos=2: 4.04 4.00 ( 0.98%) alignment=0, pos=2: 4.06 4.00 ( 1.38%) alignment=0, pos=3: 4.00 4.00 ( 0.00%) alignment=0, pos=3: 4.00 4.00 ( 0.00%) 
alignment=0, pos=4: 4.26 4.49 ( -5.34%) alignment=0, pos=4: 4.29 4.44 ( -3.38%) alignment=0, pos=5: 4.00 4.00 ( 0.00%) alignment=0, pos=5: 4.00 4.00 ( 0.00%) alignment=0, pos=6: 4.00 4.04 ( -1.00%) alignment=0, pos=6: 4.04 4.00 ( 0.98%) alignment=0, pos=7: 4.00 4.00 ( -0.00%) alignment=0, pos=7: 4.00 4.00 ( 0.00%) alignment=0, pos=8: 4.31 4.49 ( -4.12%) alignment=0, pos=8: 4.34 4.44 ( -2.30%) alignment=0, pos=9: 4.00 4.00 ( 0.00%) alignment=0, pos=9: 4.00 4.00 ( -0.00%) alignment=0, pos=10: 4.21 4.49 ( -6.75%) alignment=0, pos=10: 4.00 4.00 ( -0.00%) alignment=0, pos=11: 4.00 4.04 ( -0.99%) alignment=0, pos=11: 4.18 4.44 ( -6.31%) alignment=0, pos=12: 4.00 4.00 ( 0.00%) alignment=0, pos=12: 4.21 4.44 ( -5.32%) alignment=0, pos=13: 4.00 4.00 ( 0.00%) alignment=0, pos=13: 4.18 4.45 ( -6.37%) alignment=0, pos=14: 4.00 4.00 ( -0.00%) alignment=0, pos=14: 4.19 4.45 ( -6.12%) alignment=0, pos=15: 4.00 4.00 ( 0.01%) alignment=0, pos=15: 4.13 4.46 ( -7.83%) alignment=0, pos=16: 4.26 4.49 ( -5.44%) alignment=0, pos=16: 4.00 4.00 ( 0.00%) alignment=0, pos=17: 4.22 4.46 ( -5.68%) alignment=0, pos=17: 4.14 4.46 ( -7.77%) alignment=0, pos=18: 4.18 4.51 ( -7.85%) alignment=0, pos=18: 4.17 4.47 ( -7.05%) alignment=0, pos=19: 3.99 4.49 (-12.42%) alignment=0, pos=19: 4.17 4.48 ( -7.53%) alignment=0, pos=20: 4.16 4.47 ( -7.53%) alignment=0, pos=20: 4.13 4.45 ( -7.72%) alignment=0, pos=21: 4.18 4.46 ( -6.86%) alignment=0, pos=21: 4.16 4.45 ( -6.97%) alignment=0, pos=22: 4.13 4.45 ( -7.68%) alignment=0, pos=22: 4.14 4.45 ( -7.62%) alignment=0, pos=23: 4.16 4.44 ( -6.77%) alignment=0, pos=23: 4.15 4.46 ( -7.54%) alignment=0, pos=24: 4.16 4.49 ( -7.83%) alignment=0, pos=24: 4.16 4.49 ( -8.04%) alignment=0, pos=25: 4.19 4.45 ( -6.34%) alignment=0, pos=25: 4.16 4.50 ( -8.33%) alignment=0, pos=26: 4.17 4.46 ( -7.10%) alignment=0, pos=26: 4.16 4.50 ( -8.36%) alignment=0, pos=27: 4.20 4.44 ( -5.66%) alignment=0, pos=27: 4.15 4.45 ( -7.21%) alignment=0, pos=28: 4.20 4.46 ( -6.19%) alignment=0, pos=28: 4.15 4.49 ( -8.07%) alignment=0, pos=29: 4.19 4.45 ( -6.28%) alignment=0, pos=29: 4.14 4.46 ( -7.74%) alignment=0, pos=30: 4.14 4.50 ( -8.67%) alignment=0, pos=30: 4.22 4.45 ( -5.54%) alignment=0, pos=31: 4.16 4.46 ( -7.11%) alignment=0, pos=31: 4.17 4.44 ( -6.35%) alignment=0, pos=32: 6.06 4.44 ( 26.70%) alignment=1, pos=32: 6.11 4.44 ( 27.31%) alignment=0, pos=64: 6.78 6.43 ( 5.07%) alignment=2, pos=64: 6.71 6.43 ( 4.19%) alignment=0, pos=128: 8.00 7.09 ( 11.43%) alignment=3, pos=128: 8.11 7.14 ( 12.01%) alignment=0, pos=256: 14.38 8.72 ( 39.40%) alignment=4, pos=256: 13.90 8.74 ( 37.12%) alignment=0, pos=512: 20.64 16.18 ( 21.64%) alignment=5, pos=512: 20.69 16.23 ( 21.55%) alignment=0, pos=1024: 31.85 23.10 ( 27.48%) alignment=6, pos=1024: 31.78 23.15 ( 27.14%) alignment=0, pos=2048: 54.06 36.21 ( 33.01%) alignment=7, pos=2048: 54.14 36.28 ( 32.99%) alignment=0, pos=32: 6.09 4.45 ( 27.00%) alignment=1, pos=32: 6.12 4.46 ( 27.09%) alignment=0, pos=64: 6.81 6.53 ( 4.11%) alignment=2, pos=64: 6.76 6.43 ( 4.97%) alignment=0, pos=128: 8.11 7.10 ( 12.44%) alignment=3, pos=128: 8.07 7.13 ( 11.68%) alignment=0, pos=256: 14.23 8.57 ( 39.76%) alignment=4, pos=256: 13.90 8.74 ( 37.13%) alignment=0, pos=512: 20.25 16.19 ( 20.06%) alignment=5, pos=512: 20.67 16.45 ( 20.41%) alignment=0, pos=1024: 31.78 23.13 ( 27.24%) alignment=6, pos=1024: 31.72 23.14 ( 27.06%) alignment=0, pos=2048: 53.96 36.21 ( 32.89%) alignment=7, pos=2048: 53.96 36.16 ( 32.99%) alignment=1, pos=64: 7.17 6.78 ( 5.39%) alignment=1, pos=64: 6.78 6.44 ( 5.01%) 
alignment=2, pos=64: 6.74 6.47 ( 4.08%) alignment=2, pos=64: 6.74 6.41 ( 4.83%) alignment=3, pos=64: 6.75 6.41 ( 4.97%) alignment=3, pos=64: 6.73 6.43 ( 4.49%) alignment=4, pos=64: 6.76 6.52 ( 3.63%) alignment=4, pos=64: 6.73 6.43 ( 4.58%) alignment=5, pos=64: 6.73 6.47 ( 3.80%) alignment=5, pos=64: 6.74 6.43 ( 4.58%) alignment=6, pos=64: 6.58 6.48 ( 1.59%) alignment=6, pos=64: 6.74 6.42 ( 4.65%) alignment=7, pos=64: 6.71 6.47 ( 3.66%) alignment=7, pos=64: 6.74 6.43 ( 4.58%) alignment=0, pos=256: 14.45 8.71 ( 39.72%) alignment=0, pos=256: 13.88 8.73 ( 37.14%) alignment=16, pos=256: 14.42 8.77 ( 39.17%) alignment=16, pos=256: 13.90 8.59 ( 38.20%) alignment=32, pos=256: 14.84 8.73 ( 41.17%) alignment=32, pos=256: 14.77 8.74 ( 40.81%) alignment=48, pos=256: 15.32 8.58 ( 43.98%) alignment=48, pos=256: 15.30 8.79 ( 42.57%) alignment=64, pos=256: 16.10 8.72 ( 45.84%) alignment=64, pos=256: 16.06 8.68 ( 45.98%) alignment=80, pos=256: 16.14 8.74 ( 45.82%) alignment=80, pos=256: 15.97 8.71 ( 45.47%) alignment=96, pos=256: 13.31 8.76 ( 34.23%) alignment=96, pos=256: 13.31 8.62 ( 35.25%) alignment=112, pos=256: 13.64 8.56 ( 37.27%) alignment=112, pos=256: 13.58 8.73 ( 35.75%) alignment=0, pos=0: 3.93 4.46 (-13.48%) alignment=0, pos=0: 4.00 4.04 ( -0.98%) alignment=0, pos=1: 4.00 4.00 ( -0.01%) alignment=0, pos=1: 4.00 4.00 ( 0.00%) alignment=0, pos=2: 4.00 4.04 ( -0.98%) alignment=0, pos=2: 4.00 4.04 ( -1.00%) alignment=0, pos=3: 4.00 4.00 ( 0.01%) alignment=0, pos=3: 4.00 4.00 ( 0.00%) alignment=0, pos=4: 4.00 4.04 ( -0.99%) alignment=0, pos=4: 4.36 4.45 ( -2.19%) alignment=0, pos=5: 4.00 4.00 ( 0.00%) alignment=0, pos=5: 4.00 4.00 ( 0.00%) alignment=0, pos=6: 4.00 4.00 ( 0.00%) alignment=0, pos=6: 4.00 4.04 ( -0.99%) alignment=0, pos=7: 4.00 4.00 ( 0.00%) alignment=0, pos=7: 4.00 4.00 ( -0.00%) alignment=0, pos=8: 4.31 4.46 ( -3.38%) alignment=0, pos=8: 4.32 4.44 ( -2.85%) alignment=0, pos=9: 4.00 4.00 ( 0.00%) alignment=0, pos=9: 4.00 4.04 ( -0.99%) alignment=0, pos=10: 4.21 4.46 ( -6.06%) alignment=0, pos=10: 4.00 4.00 ( 0.00%) alignment=0, pos=11: 4.23 4.46 ( -5.47%) alignment=0, pos=11: 4.00 4.00 ( 0.00%) alignment=0, pos=12: 4.15 4.51 ( -8.45%) alignment=0, pos=12: 4.00 4.00 ( 0.00%) alignment=0, pos=13: 4.26 4.49 ( -5.59%) alignment=0, pos=13: 4.17 4.45 ( -6.66%) alignment=0, pos=14: 4.16 4.49 ( -7.77%) alignment=0, pos=14: 4.00 4.00 ( 0.00%) alignment=0, pos=15: 4.17 4.45 ( -6.91%) alignment=0, pos=15: 4.17 4.49 ( -7.81%) alignment=0, pos=16: 4.00 4.00 ( -0.00%) alignment=0, pos=16: 4.17 4.44 ( -6.58%) alignment=0, pos=17: 4.17 4.44 ( -6.47%) alignment=0, pos=17: 4.16 4.45 ( -6.91%) alignment=0, pos=18: 4.17 4.45 ( -6.59%) alignment=0, pos=18: 4.13 4.46 ( -7.95%) alignment=0, pos=19: 4.15 4.47 ( -7.77%) alignment=0, pos=19: 4.13 4.45 ( -7.75%) alignment=0, pos=20: 4.16 4.43 ( -6.52%) alignment=0, pos=20: 4.16 4.44 ( -6.70%) alignment=0, pos=21: 4.13 4.46 ( -7.85%) alignment=0, pos=21: 4.17 4.44 ( -6.52%) alignment=0, pos=22: 4.21 4.46 ( -5.88%) alignment=0, pos=22: 4.18 4.46 ( -6.49%) alignment=0, pos=23: 4.13 4.44 ( -7.47%) alignment=0, pos=23: 4.14 4.46 ( -7.60%) alignment=0, pos=24: 4.00 4.04 ( -0.98%) alignment=0, pos=24: 4.15 4.48 ( -8.00%) alignment=0, pos=25: 4.18 4.45 ( -6.39%) alignment=0, pos=25: 4.17 4.46 ( -6.94%) alignment=0, pos=26: 4.15 4.44 ( -6.87%) alignment=0, pos=26: 4.16 4.47 ( -7.47%) alignment=0, pos=27: 4.16 4.49 ( -7.86%) alignment=0, pos=27: 4.19 4.46 ( -6.43%) alignment=0, pos=28: 4.15 4.45 ( -7.07%) alignment=0, pos=28: 4.14 4.45 ( -7.51%) alignment=0, pos=29: 
4.14 4.49 ( -8.43%) alignment=0, pos=29: 4.21 4.45 ( -5.74%) alignment=0, pos=30: 4.16 4.43 ( -6.66%) alignment=0, pos=30: 4.15 4.49 ( -8.31%) alignment=0, pos=31: 4.16 4.54 ( -9.09%) alignment=0, pos=31: 4.15 4.49 ( -8.12%) alignment=0, pos=15: 4.67 4.71 ( -0.86%) alignment=0, pos=15: 5.11 5.15 ( -0.78%) alignment=0, pos=15: 4.83 5.11 ( -5.74%) alignment=0, pos=15: 5.20 5.54 ( -6.59%) alignment=0, pos=15: 4.81 5.11 ( -6.17%) alignment=0, pos=15: 5.22 5.54 ( -6.14%) alignment=0, pos=15: 4.82 5.11 ( -5.99%) alignment=0, pos=15: 5.24 5.59 ( -6.57%) alignment=0, pos=15: 4.84 5.12 ( -5.67%) alignment=0, pos=15: 5.24 5.59 ( -6.56%) alignment=0, pos=15: 4.81 5.11 ( -6.36%) alignment=0, pos=15: 5.22 5.55 ( -6.22%) alignment=0, pos=15: 4.81 5.11 ( -6.16%) alignment=0, pos=15: 5.24 5.55 ( -5.88%) alignment=0, pos=15: 4.78 5.15 ( -7.71%) alignment=0, pos=15: 5.21 5.56 ( -6.54%) alignment=0, pos=15: 4.82 5.12 ( -6.25%) alignment=0, pos=15: 5.23 5.55 ( -6.14%)
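In these dumps the first numeric column is the timing of the evex variant, the second the evex512 variant (matching the order in each "Variant:" header), and the bracketed figure appears to be the relative improvement (evex - evex512) / evex; the printed timings are rounded, so recomputing it can differ in the last digit. A minimal check against the first strchrnul row above:

#include <stdio.h>

int main (void)
{
  /* First strchrnul row: alignment=0, pos=32.  */
  double evex = 9.21, evex512 = 7.26;
  /* Prints roughly 21.2%; the report shows 21.22%.  */
  printf ("%.2f%%\n", (evex - evex512) / evex * 100.0);
  return 0;
}

Negative percentages (for example the pos=128 rows and several short wcschr lengths) are cases where the evex512 variant is slower, which is consistent with the commit message's "up to 30% less cycle ... depending on length and alignment" wording.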
On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz

ping
On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
>
> ping

see my reply to strrchr.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index df4601c294..89b58fa557 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -60,11 +60,13 @@ sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
   strchrnul-avx2 \
   strchrnul-avx2-rtm \
   strchrnul-evex \
+  strchrnul-evex512 \
   strchrnul-sse2 \
   strcmp-avx2 \
   strcmp-avx2-rtm \
@@ -129,6 +131,7 @@ sysdep_routines += \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
+  wcschr-evex512 \
   wcschr-sse2 \
   wcscmp-avx2 \
   wcscmp-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..bce1d15171 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -518,6 +518,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                && CPU_FEATURE_USABLE (AVX512BW)
                && CPU_FEATURE_USABLE (BMI2)),
               __strchr_evex)
+      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+             (CPU_FEATURE_USABLE (AVX512VL)
+              && CPU_FEATURE_USABLE (AVX512BW)),
+             __strchr_evex512)
       X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
              (CPU_FEATURE_USABLE (AVX2)
               && CPU_FEATURE_USABLE (BMI2)),
@@ -543,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                && CPU_FEATURE_USABLE (AVX512BW)
                && CPU_FEATURE_USABLE (BMI2)),
               __strchrnul_evex)
+      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
+             (CPU_FEATURE_USABLE (AVX512VL)
+              && CPU_FEATURE_USABLE (AVX512BW)),
+             __strchrnul_evex512)
       X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
              (CPU_FEATURE_USABLE (AVX2)
               && CPU_FEATURE_USABLE (BMI2)),
@@ -753,6 +761,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                && CPU_FEATURE_USABLE (AVX512BW)
                && CPU_FEATURE_USABLE (BMI2)),
               __wcschr_evex)
+      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
+             (CPU_FEATURE_USABLE (AVX512VL)
+              && CPU_FEATURE_USABLE (AVX512BW)),
+             __wcschr_evex512)
       X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
              (CPU_FEATURE_USABLE (AVX2)
               && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
new file mode 100644
index 0000000000..919dafc8b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -0,0 +1,294 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSCHR
+#  define CHAR_REG      esi
+#  define CHAR_SIZE     4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP         vpcmpd
+#  define VPMINU        vpminud
+#  define VPTESTN       vptestnmd
+# else
+#  define CHAR_REG      sil
+#  define CHAR_SIZE     1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP         vpcmpb
+#  define VPMINU        vpminub
+#  define VPTESTN       vptestnmb
+# endif
+
+# define PAGE_SIZE      4096
+# define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
+# define XMM1           xmm17
+
+# if VEC_SIZE == 64
+#  define KMOV          kmovq
+#  define KORTEST       kortestq
+#  define RAX           rax
+#  define RCX           rcx
+#  define RDX           rdx
+#  define SHR           shrq
+#  define TEXTSUFFIX    evex512
+#  define VMM0          zmm16
+#  define VMM1          zmm17
+#  define VMM2          zmm18
+#  define VMM3          zmm19
+#  define VMM4          zmm20
+#  define VMM5          zmm21
+#  define VMOVA         vmovdqa64
+#  define VMOVU         vmovdqu64
+
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV          kmovd
+#  define KORTEST       kortestd
+#  define RAX           eax
+#  define RCX           ecx
+#  define RDX           edx
+#  define SHR           shrl
+#  define TEXTSUFFIX    evex256
+#  define VMM0          ymm16
+#  define VMM1          ymm17
+#  define VMM2          ymm18
+#  define VMM3          ymm19
+#  define VMM4          ymm20
+#  define VMM5          ymm21
+#  define VMOVA         vmovdqa32
+#  define VMOVU         vmovdqu32
+# endif
+
+        .section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRCHR, 6)
+
+        /* Broadcast CHAR to VMM0.  */
+        VPBROADCAST %esi, %VMM0
+        movl    %edi, %eax
+        andl    $(PAGE_SIZE - 1), %eax
+        cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
+        ja      L(page_cross)
+
+        /* Compare [w]char for null, mask bit will be set for match.  */
+        VMOVU   (%rdi), %VMM1
+
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+
+        KMOV    %k0, %RAX
+# ifndef USE_AS_STRCHRNUL
+        test    %RAX, %RAX
+        jz      L(align_more)
+        bsf     %RAX, %RAX
+# else
+        /* For strchnul, using bsf, if string is less than 64 byte,
+           entire logic will fit in 64 byte cache line and offset
+           the perf gap as compared to evex version.  Even though
+           using bsf as condition can save code size but it is not
+           preferred for conditional jump for 2 reason.  1) It's
+           latency is 3.  2) Unlike test, it can't be micro-fused
+           with jump.  */
+        bsf     %RAX, %RAX
+        jz      L(align_more)
+# endif
+
+# ifdef USE_AS_WCSCHR
+        leaq    (%rdi, %rax, CHAR_SIZE), %rax
+# else
+        add     %rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+        cmp     (%rax), %CHAR_REG
+        jne     L(zero)
+# endif
+        ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+        xorl    %eax, %eax
+        ret
+# endif
+
+L(ret_vec_x2):
+        subq    $-VEC_SIZE, %rax
+L(ret_vec_x1):
+        bsf     %RCX, %RCX
+# ifdef USE_AS_WCSCHR
+        leaq    (%rax, %rcx, CHAR_SIZE), %rax
+# else
+        add     %rcx, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+        cmp     (%rax), %CHAR_REG
+        jne     L(zero)
+# endif
+        ret
+
+L(align_more):
+        leaq    VEC_SIZE(%rdi), %rax
+        /* Align rax to VEC_SIZE.  */
+        andq    $-VEC_SIZE, %rax
+
+        /* Loop unroll 4 times for 4 vector loop.  */
+        VMOVA   (%rax), %VMM1
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x1)
+
+        VMOVA   VEC_SIZE(%rax), %VMM1
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x2)
+
+        VMOVA   (VEC_SIZE * 2)(%rax), %VMM1
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x3)
+
+        VMOVA   (VEC_SIZE * 3)(%rax), %VMM1
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x4)
+
+        /* Align address to VEC_SIZE * 4 for loop.  */
+        andq    $-(VEC_SIZE * 4), %rax
+
+        .p2align 4,,11
+L(loop):
+        /* VPMINU and VPCMP combination provide better performance as
+           compared to alternative combinations.  */
+        VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
+        VMOVA   (VEC_SIZE * 5)(%rax), %VMM2
+        VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
+        VMOVA   (VEC_SIZE * 7)(%rax), %VMM4
+
+        vpxorq  %VMM1, %VMM0, %VMM5
+        VPMINU  %VMM5, %VMM1, %VMM1
+
+        VPCMP   $4, %VMM0, %VMM2, %k1
+        VPMINU  %VMM1, %VMM2, %VMM2{%k1}{z}
+
+        VPCMP   $4, %VMM0, %VMM3, %k2
+        VPMINU  %VMM2, %VMM3, %VMM3{%k2}{z}
+
+        VPCMP   $4, %VMM0, %VMM4, %k3
+        VPMINU  %VMM3, %VMM4, %VMM4{%k3}{z}
+
+        VPTESTN %VMM4, %VMM4, %k3
+
+        subq    $-(VEC_SIZE * 4), %rax
+        KORTEST %k3, %k3
+        jz      L(loop)
+
+        VPTESTN %VMM1, %VMM1, %k0
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x1)
+
+        VPTESTN %VMM2, %VMM2, %k0
+        KMOV    %k0, %RCX
+        /* At this point, if k1 is non zero, null char must be in the
+           second vector.  */
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x2)
+
+        VPTESTN %VMM3, %VMM3, %k0
+        KMOV    %k0, %RCX
+        test    %RCX, %RCX
+        jnz     L(ret_vec_x3)
+        /* At this point null [w]char must be in the fourth vector so no
+           need to check.  */
+        KMOV    %k3, %RCX
+
+L(ret_vec_x4):
+        bsf     %RCX, %RCX
+        leaq    (VEC_SIZE * 3)(%rax, %rcx, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+        cmp     (%rax), %CHAR_REG
+        jne     L(zero)
+# endif
+        ret
+
+L(ret_vec_x3):
+        bsf     %RCX, %RCX
+        leaq    (VEC_SIZE * 2)(%rax, %rcx, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+        cmp     (%rax), %CHAR_REG
+        jne     L(zero)
+# endif
+        ret
+
+L(page_cross):
+        movl    %eax, %ecx
+# ifdef USE_AS_WCSCHR
+        /* Calculate number of compare result bits to be skipped for
+           wide string alignment adjustment.  */
+        andl    $(VEC_SIZE - 1), %ecx
+        sarl    $2, %ecx
+# endif
+        /* ecx contains number of w[char] to be skipped as a result
+           of address alignment.  */
+        xorq    %rdi, %rax
+        VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
+        vpxorq  %VMM1, %VMM0, %VMM2
+        VPMINU  %VMM2, %VMM1, %VMM2
+        VPTESTN %VMM2, %VMM2, %k0
+        KMOV    %k0, %RAX
+        /* Ignore number of character for alignment adjustment.  */
+        SHR     %cl, %RAX
+        jz      L(align_more)
+
+        bsf     %RAX, %RAX
+# ifdef USE_AS_WCSCHR
+        leaq    (%rdi, %rax, CHAR_SIZE), %rax
+# else
+        addq    %rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+        cmp     (%rax), %CHAR_REG
+        jne     L(zero)
+# endif
+        ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
new file mode 100644
index 0000000000..4079bf387d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
@@ -0,0 +1,7 @@
+# ifndef STRCHR
+# define STRCHR __strchr_evex512
+# endif
+
+#define VEC_SIZE 64
+
+#include "strchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
new file mode 100644
index 0000000000..1be0b12f38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
@@ -0,0 +1,8 @@
+#ifndef STRCHRNUL
+# define STRCHRNUL __strchrnul_evex512
+#endif
+
+#define STRCHR STRCHRNUL
+#define USE_AS_STRCHRNUL 1
+
+#include "strchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
new file mode 100644
index 0000000000..50c87ab1e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
@@ -0,0 +1,8 @@
+#ifndef WCSCHR
+# define WCSCHR __wcschr_evex512
+#endif
+
+#define STRCHR WCSCHR
+#define USE_AS_WCSCHR 1
+
+#include "strchr-evex512.S"
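Note on the ifunc additions above: the new __*_evex512 entries are registered at the V4 level and are only selected when both AVX512VL and AVX512BW are usable. The stand-alone C sketch below approximates that selection condition; it is illustrative only. glibc's real dispatch uses its own CPU_FEATURE_USABLE bookkeeping rather than __builtin_cpu_supports, and the function name here is made up.

#include <stdbool.h>

/* Rough approximation of the X86_IFUNC_IMPL_ADD_V4 condition above.
   Not glibc code.  */
static bool
can_use_strchr_evex512 (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx512vl")
         && __builtin_cpu_supports ("avx512bw");
}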
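The entry path's vpxorq + VPMINU + VPTESTN sequence folds the "found CHAR" and "found null terminator" tests into a single mask, so one test of the mask decides between the fast return and L(align_more). A minimal C sketch with AVX-512 intrinsics, assuming byte characters and a 64-byte vector; the function name and the unaligned load are illustrative, not part of the patch:

#include <immintrin.h>
#include <stdint.h>

/* Returns a 64-bit mask with bit i set iff p[i] == c or p[i] == '\0'.
   Mirrors vpxorq + vpminub + vptestnmb in the entry path.  */
static uint64_t
match_or_null_mask (const char *p, char c)
{
  __m512i v = _mm512_loadu_si512 ((const void *) p);
  __m512i x = _mm512_xor_si512 (v, _mm512_set1_epi8 (c)); /* 0 where p[i] == c.  */
  __m512i m = _mm512_min_epu8 (x, v);                     /* 0 where match or null.  */
  return _mm512_testn_epi8_mask (m, m);                   /* bit set where byte is 0.  */
}

bsf (or __builtin_ctzll) on the returned mask then gives the offset of the first match or terminator from p.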
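The 4-vector loop avoids a separate VPTESTN per vector by accumulating hit information: a not-equal compare (VPCMP $4) plus a zero-masked VPMINU leaves a zero byte in the accumulator wherever any of the four vectors contained CHAR or a null, so a single branch covers four vectors per iteration. A hedged C sketch of one iteration, assuming byte characters and that p is 64-byte aligned; the helper name is invented:

#include <immintrin.h>

/* Non-zero iff any of the 256 bytes at p equals the byte broadcast in vc
   or is a null terminator.  */
static int
block_has_hit (const char *p, __m512i vc)
{
  __m512i v1 = _mm512_load_si512 ((const void *) p);
  __m512i v2 = _mm512_load_si512 ((const void *) (p + 64));
  __m512i v3 = _mm512_load_si512 ((const void *) (p + 128));
  __m512i v4 = _mm512_load_si512 ((const void *) (p + 192));

  /* Zero byte where v1 matches vc or is the null terminator.  */
  __m512i acc = _mm512_min_epu8 (_mm512_xor_si512 (v1, vc), v1);

  /* Force a zero byte on a match via zero-masking, otherwise propagate
     any zero already accumulated or present as a terminator.  */
  acc = _mm512_maskz_min_epu8 (_mm512_cmpneq_epi8_mask (v2, vc), acc, v2);
  acc = _mm512_maskz_min_epu8 (_mm512_cmpneq_epi8_mask (v3, vc), acc, v3);
  acc = _mm512_maskz_min_epu8 (_mm512_cmpneq_epi8_mask (v4, vc), acc, v4);

  /* Non-zero iff some byte of acc is zero, i.e. some vector had a hit.  */
  return _mm512_testn_epi8_mask (acc, acc) != 0;
}

When the loop exits, the per-vector VPTESTN/KMOV checks in the epilogue recover which of the four vectors produced the hit, as in the assembly above.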
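The L(page_cross) path avoids reading past a page boundary by doing an aligned load of the last VEC_SIZE bytes of the current page and then shifting away the mask bits that belong to bytes before the string start. A sketch under the same assumptions (byte characters, 4 KiB pages, 64-byte vectors); the helper is illustrative:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64

/* Returns the first byte at or after s that matches c or is '\0',
   looking only up to the end of s's page; NULL means keep scanning
   from the next vector boundary.  */
static const char *
first_hit_before_page_end (const char *s, char c)
{
  uintptr_t addr = (uintptr_t) s;
  uintptr_t page = addr & ~(uintptr_t) (PAGE_SIZE - 1);
  /* Aligned load of the last vector of the page; it may read bytes
     before s but never crosses into the next page.  */
  const void *p = (const void *) (page + PAGE_SIZE - VEC_SIZE);
  __m512i v = _mm512_load_si512 (p);
  __m512i x = _mm512_xor_si512 (v, _mm512_set1_epi8 (c));
  __m512i m = _mm512_min_epu8 (x, v);
  uint64_t k = _mm512_testn_epi8_mask (m, m);
  /* Drop the bits for bytes that precede s within this vector.  */
  k >>= addr & (VEC_SIZE - 1);
  if (k == 0)
    return NULL;
  return s + __builtin_ctzll (k);
}

This mirrors the SHR %cl, %RAX step: the shift count is the string's misalignment within the final vector, so the surviving mask bits line up with s itself.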