Message ID | 20230630204812.2059831-1-skpgkp2@gmail.com |
---|---|
State | New |
Series | x86_64: Implement AVX2 version of strlcpy/wcslcpy function |
On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > wcscpy-evex \ > wcscpy-generic \ > wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. 
*/ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. */ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy chracter must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. */ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. 
*/ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. */ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 > Think we should at the very least wait for the generic strlcpy codes to land first.
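For context, the contract the AVX2 routine above has to match is the BSD strlcpy one: copy at most size-1 bytes, always NUL-terminate the destination when size is nonzero, and return the full length of the source so the caller can detect truncation. A minimal C sketch of that contract (illustrative reference only, not the glibc implementation; the helper name strlcpy_ref is made up here):

    #include <string.h>

    /* Reference semantics for strlcpy: bounded copy, guaranteed
       NUL termination when size > 0, returns strlen (src).  */
    size_t
    strlcpy_ref (char *dest, const char *src, size_t size)
    {
      size_t len = strlen (src);
      if (size > 0)
        {
          size_t copy = len < size ? len : size - 1;
          memcpy (dest, src, copy);
          dest[copy] = '\0';
        }
      return len;
    }

The returned length is what makes the usual truncation check work: if (strlcpy (buf, src, sizeof buf) >= sizeof buf), the copy was truncated. It is also why the assembly above saves the original source pointer in %r8 and keeps computing the return value even after RDX (the size bound) reaches zero. wcslcpy is the analogous contract over wide characters.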
On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
> Think we should at the very least wait for the generic strlcpy codes
> to land first.

Let's not optimize these functions at all unless there's a good and measured reason to do so. In practice I expect they're called with small sizes, for which optimization is a net minus: it consumes valuable maintenance time with no real benefit.
On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: > On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > > Think we should at the very least wait for the generic strlcpy codes > > to land first. > > Let's not optimize these functions at all, unless there's good and > measured reason to do so. In practice I expected they're called with > small sizes for which optimization is a net minus as it consumes > valuable maintenance time with no real benefit. > Hi Paul, Attached is strcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html Thanks, Sunil Function: wcslcpy Variant: __wcslcpy_avx2 __wcslcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 14.99 ( 24.63%) 19.89 len=16, align1=1, align2=1, n=16: 14.58 ( 19.61%) 18.13 len=16, align1=1, align2=2, n=16: 16.99 ( 4.02%) 17.70 len=16, align1=2, align2=1, n=16: 15.14 ( 17.08%) 18.25 len=2, align1=7, align2=2, n=4: 8.40 ( 44.41%) 15.11 len=4, align1=2, align2=7, n=2: 10.91 ( 42.41%) 18.95 len=2, align1=7, align2=2, n=4: 8.92 ( 34.99%) 13.72 len=4, align1=2, align2=7, n=2: 10.92 ( 42.05%) 18.84 len=16, align1=2, align2=2, n=16: 15.70 ( 11.97%) 17.84 len=16, align1=2, align2=2, n=16: 14.83 ( 16.82%) 17.83 len=16, align1=2, align2=4, n=16: 17.30 ( -0.46%) 17.22 len=16, align1=4, align2=2, n=16: 15.44 ( 15.20%) 18.21 len=4, align1=6, align2=4, n=8: 12.87 ( 14.74%) 15.09 len=8, align1=4, align2=6, n=4: 13.72 ( 25.95%) 18.53 len=4, align1=6, align2=4, n=8: 12.85 ( 9.03%) 14.13 len=8, align1=4, align2=6, n=4: 12.67 ( 31.60%) 18.52 len=16, align1=3, align2=3, n=16: 14.57 ( 15.76%) 17.30 len=16, align1=3, align2=3, n=16: 14.82 ( 14.03%) 17.23 len=16, align1=3, align2=6, n=16: 17.02 ( 3.24%) 17.59 len=16, align1=6, align2=3, n=16: 15.04 ( 19.50%) 18.68 len=8, align1=5, align2=6, n=16: 14.96 ( 8.00%) 16.26 len=16, align1=6, align2=5, n=8: 13.70 ( 25.56%) 18.41 len=8, align1=5, align2=6, n=16: 14.54 ( 7.87%) 15.78 len=16, align1=6, align2=5, n=8: 12.35 ( 24.15%) 16.28 len=16, align1=4, align2=4, n=16: 13.93 ( 14.41%) 16.28 len=16, align1=4, align2=4, n=16: 13.63 ( 16.32%) 16.29 len=16, align1=4, align2=0, n=16: 12.97 ( 21.40%) 16.51 len=16, align1=0, align2=4, n=16: 14.09 ( 15.59%) 16.70 len=16, align1=4, align2=0, n=32: 13.75 ( 31.95%) 20.20 len=32, align1=0, align2=4, n=16: 19.20 ( -0.01%) 19.20 len=16, align1=4, align2=0, n=32: 14.45 ( 31.46%) 21.08 len=32, align1=0, align2=4, n=16: 19.07 ( -1.55%) 18.78 len=16, align1=5, align2=5, n=16: 14.89 ( 15.97%) 17.72 len=16, align1=5, align2=5, n=16: 13.78 ( 15.12%) 16.23 len=16, align1=5, align2=2, n=16: 14.89 ( 13.72%) 17.26 len=16, align1=2, align2=5, n=16: 17.32 ( -0.72%) 17.20 len=32, align1=3, align2=2, n=64: 23.78 ( 20.84%) 30.05 len=64, align1=2, align2=3, n=32: 24.54 ( 0.82%) 24.74 len=32, align1=3, align2=2, n=64: 22.48 ( 17.99%) 27.41 len=64, align1=2, align2=3, n=32: 22.63 ( 8.72%) 24.79 len=16, align1=6, align2=6, n=16: 14.76 ( 14.10%) 17.19 len=16, align1=6, align2=6, n=16: 14.57 ( 16.81%) 17.52 len=16, align1=6, align2=4, n=16: 14.88 ( 13.70%) 17.25 len=16, align1=4, align2=6, n=16: 16.29 ( -0.14%) 16.27 len=64, align1=2, align2=4, n=128: 28.40 ( 9.37%) 31.34 len=128, align1=4, align2=2, n=64: 28.48 ( 10.08%) 31.67 len=64, align1=2, align2=4, n=128: 29.65 ( 11.33%) 33.44 len=128, align1=4, align2=2, n=64: 30.18 ( 6.40%) 32.25 len=16, align1=7, align2=7, n=16: 
14.86 ( 8.40%) 16.22 len=16, align1=7, align2=7, n=16: 13.78 ( 16.30%) 16.47 len=16, align1=7, align2=6, n=16: 14.23 ( 12.27%) 16.22 len=16, align1=6, align2=7, n=16: 16.30 ( -0.53%) 16.22 len=128, align1=1, align2=6, n=256: 35.07 ( 25.88%) 47.32 len=256, align1=6, align2=1, n=128: 45.32 ( 11.90%) 51.44 len=128, align1=1, align2=6, n=256: 35.14 ( 24.65%) 46.64 len=256, align1=6, align2=1, n=128: 43.26 ( 15.54%) 51.22 len=8, align1=0, align2=0, n=16: 13.17 ( 29.35%) 18.65 len=32, align1=0, align2=0, n=16: 18.81 ( -3.57%) 18.17 len=8, align1=7, align2=2, n=16: 13.92 ( -7.07%) 13.00 len=32, align1=7, align2=2, n=16: 17.52 ( 14.77%) 20.55 len=16, align1=0, align2=0, n=32: 13.85 ( 33.77%) 20.91 len=64, align1=0, align2=0, n=32: 23.32 ( 7.24%) 25.14 len=16, align1=6, align2=4, n=32: 14.87 ( 17.40%) 18.00 len=64, align1=6, align2=4, n=32: 23.32 ( 14.99%) 27.43 len=32, align1=0, align2=0, n=64: 21.05 ( 16.72%) 25.28 len=128, align1=0, align2=0, n=64: 28.81 ( 11.25%) 32.46 len=32, align1=5, align2=6, n=64: 24.68 ( 10.16%) 27.47 len=128, align1=5, align2=6, n=64: 28.66 ( 7.24%) 30.89 len=64, align1=0, align2=0, n=128: 24.98 ( 21.37%) 31.77 len=256, align1=0, align2=0, n=128: 43.90 ( 18.92%) 54.14 len=64, align1=4, align2=0, n=128: 26.13 ( 24.65%) 34.68 len=256, align1=4, align2=0, n=128: 44.27 ( 15.06%) 52.12 len=128, align1=0, align2=0, n=256: 34.29 ( 33.53%) 51.58 len=512, align1=0, align2=0, n=256: 68.94 ( 8.14%) 75.05 len=128, align1=3, align2=2, n=256: 36.06 ( 15.45%) 42.65 len=512, align1=3, align2=2, n=256: 65.15 ( 12.33%) 74.32 len=256, align1=0, align2=0, n=512: 46.37 ( 30.42%) 66.64 len=1024, align1=0, align2=0, n=512: 114.89 ( 8.32%) 125.31 len=256, align1=2, align2=4, n=512: 56.05 ( 16.50%) 67.12 len=1024, align1=2, align2=4, n=512: 179.87 (-52.13%) 118.24 len=512, align1=0, align2=0, n=1024: 68.16 ( 29.70%) 96.96 len=512, align1=1, align2=6, n=1024: 119.39 (-26.04%) 94.72 len=128, align1=1, align2=0, n=64: 27.46 ( 17.94%) 33.46 len=128, align1=0, align2=0, n=64: 29.69 ( -2.62%) 28.93 len=128, align1=0, align2=0, n=64: 27.25 ( 6.15%) 29.03 len=128, align1=0, align2=0, n=64: 27.24 ( 6.61%) 29.17 len=64, align1=1, align2=0, n=128: 25.50 ( 21.40%) 32.44 len=64, align1=0, align2=0, n=128: 23.50 ( 27.08%) 32.22 len=64, align1=0, align2=0, n=128: 24.88 ( 16.98%) 29.97 len=64, align1=0, align2=0, n=128: 24.59 ( 22.98%) 31.92 len=128, align1=1, align2=0, n=96: 27.46 ( 29.72%) 39.07 len=128, align1=0, align2=0, n=96: 28.55 ( 20.33%) 35.83 len=128, align1=0, align2=0, n=96: 27.25 ( 24.21%) 35.95 len=128, align1=0, align2=0, n=96: 28.53 ( 19.86%) 35.59 len=96, align1=1, align2=0, n=128: 30.65 ( 18.65%) 37.68 len=96, align1=0, align2=0, n=128: 28.06 ( 19.41%) 34.82 len=96, align1=0, align2=0, n=128: 27.92 ( 20.27%) 35.02 len=96, align1=0, align2=0, n=128: 28.06 ( 19.43%) 34.83 len=128, align1=1, align2=0, n=128: 31.31 ( 28.02%) 43.51 len=128, align1=0, align2=0, n=128: 28.52 ( 29.34%) 40.37 len=128, align1=0, align2=0, n=128: 27.25 ( 32.18%) 40.17 len=128, align1=0, align2=0, n=128: 27.46 ( 31.33%) 39.99 len=128, align1=1, align2=0, n=128: 31.32 ( 28.00%) 43.50 len=128, align1=0, align2=0, n=128: 27.46 ( 31.03%) 39.82 len=128, align1=0, align2=0, n=128: 27.25 ( 32.30%) 40.25 len=128, align1=0, align2=0, n=128: 27.25 ( 31.97%) 40.05 len=128, align1=1, align2=0, n=160: 34.00 ( 20.12%) 42.56 len=128, align1=0, align2=0, n=160: 32.19 ( 30.63%) 46.40 len=128, align1=0, align2=0, n=160: 32.17 ( 28.12%) 44.76 len=128, align1=0, align2=0, n=160: 32.39 ( 27.63%) 44.76 len=160, align1=1, align2=0, n=128: 
29.84 ( 35.97%) 46.61 len=160, align1=0, align2=0, n=128: 31.79 ( 25.56%) 42.71 len=160, align1=0, align2=0, n=128: 32.00 ( 24.86%) 42.59 len=160, align1=0, align2=0, n=128: 31.79 ( 25.85%) 42.86 len=128, align1=1, align2=0, n=192: 33.81 ( 21.08%) 42.84 len=128, align1=0, align2=0, n=192: 32.38 ( 29.98%) 46.24 len=128, align1=0, align2=0, n=192: 32.38 ( 27.38%) 44.58 len=128, align1=0, align2=0, n=192: 32.18 ( 28.29%) 44.87 len=192, align1=1, align2=0, n=128: 34.71 ( 27.54%) 47.90 len=192, align1=0, align2=0, n=128: 35.25 ( 22.44%) 45.44 len=192, align1=0, align2=0, n=128: 35.30 ( 21.97%) 45.24 len=192, align1=0, align2=0, n=128: 35.03 ( 22.17%) 45.01 len=256, align1=1, align2=0, n=192: 39.58 ( 30.82%) 57.21 len=256, align1=0, align2=0, n=192: 42.27 ( 24.21%) 55.77 len=256, align1=0, align2=0, n=192: 41.10 ( 26.00%) 55.54 len=256, align1=0, align2=0, n=192: 43.11 ( 21.51%) 54.92 len=192, align1=1, align2=0, n=256: 38.15 ( 29.78%) 54.33 len=192, align1=0, align2=0, n=256: 37.43 ( 32.27%) 55.26 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=256, align1=1, align2=0, n=224: 40.87 ( 31.48%) 59.65 len=256, align1=0, align2=0, n=224: 41.66 ( 26.95%) 57.02 len=256, align1=0, align2=0, n=224: 41.08 ( 28.22%) 57.24 len=256, align1=0, align2=0, n=224: 41.17 ( 27.86%) 57.07 len=224, align1=1, align2=0, n=256: 38.96 ( 32.41%) 57.65 len=224, align1=0, align2=0, n=256: 42.27 ( 28.61%) 59.21 len=224, align1=0, align2=0, n=256: 40.15 ( 32.33%) 59.34 len=224, align1=0, align2=0, n=256: 40.10 ( 32.78%) 59.65 len=256, align1=1, align2=0, n=256: 41.22 ( 33.31%) 61.80 len=256, align1=0, align2=0, n=256: 41.52 ( 29.99%) 59.30 len=256, align1=0, align2=0, n=256: 41.17 ( 29.82%) 58.66 len=256, align1=0, align2=0, n=256: 41.18 ( 30.68%) 59.40 len=256, align1=1, align2=0, n=256: 47.52 ( 29.49%) 67.39 len=256, align1=0, align2=0, n=256: 44.83 ( 30.61%) 64.60 len=256, align1=0, align2=0, n=256: 45.50 ( 29.57%) 64.60 len=256, align1=0, align2=0, n=256: 44.83 ( 29.93%) 63.97 len=256, align1=1, align2=0, n=288: 44.21 ( 33.34%) 66.32 len=256, align1=0, align2=0, n=288: 41.58 ( 33.60%) 62.62 len=256, align1=0, align2=0, n=288: 44.57 ( 30.02%) 63.69 len=256, align1=0, align2=0, n=288: 42.80 ( 35.55%) 66.41 len=288, align1=1, align2=0, n=256: 46.39 ( 29.55%) 65.85 len=288, align1=0, align2=0, n=256: 45.95 ( 28.95%) 64.68 len=288, align1=0, align2=0, n=256: 46.26 ( 29.92%) 66.02 len=288, align1=0, align2=0, n=256: 48.47 ( 20.26%) 60.79 len=256, align1=1, align2=0, n=320: 41.81 ( 31.09%) 60.67 len=256, align1=0, align2=0, n=320: 41.87 ( 34.40%) 63.82 len=256, align1=0, align2=0, n=320: 41.52 ( 34.47%) 63.35 len=256, align1=0, align2=0, n=320: 44.29 ( 33.29%) 66.39 len=320, align1=1, align2=0, n=256: 48.70 ( 29.59%) 69.16 len=320, align1=0, align2=0, n=256: 46.47 ( 24.55%) 61.60 len=320, align1=0, align2=0, n=256: 45.68 ( 27.30%) 62.83 len=320, align1=0, align2=0, n=256: 47.34 ( 23.15%) 61.60 len=512, align1=1, align2=0, n=448: 72.59 ( 23.10%) 94.39 len=512, align1=0, align2=0, n=448: 68.84 ( 38.34%) 111.65 len=512, align1=0, align2=0, n=448: 69.80 ( 36.56%) 110.03 len=512, align1=0, align2=0, n=448: 67.31 ( 40.49%) 113.10 len=448, align1=1, align2=0, n=512: 65.75 ( 28.23%) 91.61 len=448, align1=0, align2=0, n=512: 61.41 ( 30.51%) 88.36 len=448, align1=0, align2=0, n=512: 65.19 ( 29.15%) 92.02 len=448, align1=0, align2=0, n=512: 61.07 ( 31.08%) 88.61 len=512, align1=1, align2=0, n=480: 75.89 ( 16.65%) 91.05 len=512, align1=0, align2=0, n=480: 66.17 ( 
26.56%) 90.10 len=512, align1=0, align2=0, n=480: 65.74 ( 26.92%) 89.96 len=512, align1=0, align2=0, n=480: 66.30 ( 26.50%) 90.21 len=480, align1=1, align2=0, n=512: 65.24 ( 28.33%) 91.03 len=480, align1=0, align2=0, n=512: 64.50 ( 30.43%) 92.70 len=480, align1=0, align2=0, n=512: 64.49 ( 29.90%) 91.99 len=480, align1=0, align2=0, n=512: 64.50 ( 30.11%) 92.29 len=512, align1=1, align2=0, n=512: 68.43 ( 28.04%) 95.09 len=512, align1=0, align2=0, n=512: 67.02 ( 27.18%) 92.05 len=512, align1=0, align2=0, n=512: 67.02 ( 27.01%) 91.82 len=512, align1=0, align2=0, n=512: 67.02 ( 27.05%) 91.87 len=512, align1=1, align2=0, n=512: 67.68 ( 28.93%) 95.23 len=512, align1=0, align2=0, n=512: 67.03 ( 27.48%) 92.42 len=512, align1=0, align2=0, n=512: 67.02 ( 27.15%) 92.00 len=512, align1=0, align2=0, n=512: 67.02 ( 27.33%) 92.23 len=512, align1=1, align2=0, n=544: 70.63 ( 26.35%) 95.89 len=512, align1=0, align2=0, n=544: 67.72 ( 29.97%) 96.70 len=512, align1=0, align2=0, n=544: 67.71 ( 30.17%) 96.95 len=512, align1=0, align2=0, n=544: 67.71 ( 29.99%) 96.72 len=544, align1=1, align2=0, n=512: 83.22 ( 13.39%) 96.08 len=544, align1=0, align2=0, n=512: 68.97 ( 27.78%) 95.50 len=544, align1=0, align2=0, n=512: 71.83 ( 24.53%) 95.18 len=544, align1=0, align2=0, n=512: 68.99 ( 27.28%) 94.87 len=512, align1=1, align2=0, n=576: 72.60 ( 28.17%) 101.08 len=512, align1=0, align2=0, n=576: 72.27 ( 25.52%) 97.03 len=512, align1=0, align2=0, n=576: 67.75 ( 30.53%) 97.52 len=512, align1=0, align2=0, n=576: 72.53 ( 29.10%) 102.30 len=576, align1=1, align2=0, n=512: 82.05 ( 16.23%) 97.94 len=576, align1=0, align2=0, n=512: 71.35 ( 26.64%) 97.26 len=576, align1=0, align2=0, n=512: 74.36 ( 23.52%) 97.23 len=576, align1=0, align2=0, n=512: 71.58 ( 26.50%) 97.38 len=1024, align1=1, align2=0, n=960: 147.26 ( 11.02%) 165.50 len=1024, align1=0, align2=0, n=960: 134.00 ( 13.30%) 154.55 len=1024, align1=0, align2=0, n=960: 134.31 ( 13.26%) 154.84 len=1024, align1=0, align2=0, n=960: 134.53 ( 12.97%) 154.58 len=960, align1=1, align2=0, n=1024: 129.09 ( 20.84%) 163.08 len=960, align1=0, align2=0, n=1024: 113.32 ( 26.35%) 153.86 len=960, align1=0, align2=0, n=1024: 113.08 ( 26.77%) 154.42 len=960, align1=0, align2=0, n=1024: 113.10 ( 26.50%) 153.88 len=1024, align1=1, align2=0, n=992: 138.81 ( 18.75%) 170.85 len=1024, align1=0, align2=0, n=992: 134.08 ( 14.74%) 157.25 len=1024, align1=0, align2=0, n=992: 133.96 ( 14.83%) 157.28 len=1024, align1=0, align2=0, n=992: 133.76 ( 15.03%) 157.42 len=992, align1=1, align2=0, n=1024: 136.17 ( 18.21%) 166.50 len=992, align1=0, align2=0, n=1024: 116.81 ( 29.71%) 166.18 len=992, align1=0, align2=0, n=1024: 116.46 ( 26.72%) 158.92 len=992, align1=0, align2=0, n=1024: 116.63 ( 26.64%) 158.99 len=1024, align1=1, align2=0, n=1024: 150.63 ( 14.32%) 175.81 len=1024, align1=0, align2=0, n=1024: 119.07 ( 26.07%) 161.07 len=1024, align1=0, align2=0, n=1024: 119.10 ( 26.06%) 161.08 len=1024, align1=0, align2=0, n=1024: 118.91 ( 26.16%) 161.04 len=1024, align1=1, align2=0, n=1024: 158.94 ( 13.17%) 183.06 len=1024, align1=0, align2=0, n=1024: 120.68 ( 27.45%) 166.35 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.03%) 161.09 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.02%) 161.07 len=1024, align1=1, align2=0, n=1056: 162.90 ( 15.29%) 192.30 len=1024, align1=0, align2=0, n=1056: 140.90 ( 26.76%) 192.38 len=1024, align1=0, align2=0, n=1056: 140.05 ( 30.28%) 200.89 len=1024, align1=0, align2=0, n=1056: 146.22 ( 25.04%) 195.08 len=1056, align1=1, align2=0, n=1024: 166.62 ( 8.97%) 183.03 len=1056, 
align1=0, align2=0, n=1024: 121.48 ( 25.46%) 162.98 len=1056, align1=0, align2=0, n=1024: 123.93 ( 24.01%) 163.09 len=1056, align1=0, align2=0, n=1024: 127.86 ( 25.98%) 172.73 len=1024, align1=1, align2=0, n=1088: 167.49 ( 12.93%) 192.36 len=1024, align1=0, align2=0, n=1088: 147.48 ( 23.34%) 192.38 len=1024, align1=0, align2=0, n=1088: 140.01 ( 27.22%) 192.39 len=1024, align1=0, align2=0, n=1088: 140.09 ( 27.23%) 192.51 len=1088, align1=1, align2=0, n=1024: 159.00 ( 13.46%) 183.73 len=1088, align1=0, align2=0, n=1024: 143.31 ( 14.25%) 167.13 len=1088, align1=0, align2=0, n=1024: 140.46 ( 14.32%) 163.93 len=1088, align1=0, align2=0, n=1024: 139.85 ( 14.69%) 163.92 Function: strlcpy Variant: __strlcpy_avx2 __strlcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 11.11 ( 32.32%) 16.41 len=16, align1=1, align2=1, n=16: 10.73 ( 32.83%) 15.98 len=16, align1=1, align2=2, n=16: 10.53 ( 33.23%) 15.77 len=16, align1=2, align2=1, n=16: 10.89 ( 32.50%) 16.13 len=2, align1=7, align2=2, n=4: 8.06 ( 35.05%) 12.41 len=4, align1=2, align2=7, n=2: 8.66 ( 37.31%) 13.82 len=2, align1=7, align2=2, n=4: 7.78 ( 33.85%) 11.77 len=4, align1=2, align2=7, n=2: 8.70 ( 37.88%) 14.01 len=16, align1=2, align2=2, n=16: 10.43 ( 31.86%) 15.31 len=16, align1=2, align2=2, n=16: 10.87 ( 30.40%) 15.62 len=16, align1=2, align2=4, n=16: 10.47 ( 30.24%) 15.01 len=16, align1=4, align2=2, n=16: 10.56 ( 31.99%) 15.53 len=4, align1=6, align2=4, n=8: 11.33 ( 18.99%) 13.99 len=8, align1=4, align2=6, n=4: 10.44 ( 27.20%) 14.34 len=4, align1=6, align2=4, n=8: 11.43 ( 13.14%) 13.15 len=8, align1=4, align2=6, n=4: 10.83 ( 28.59%) 15.16 len=16, align1=3, align2=3, n=16: 10.39 ( 33.18%) 15.54 len=16, align1=3, align2=3, n=16: 10.13 ( 38.74%) 16.53 len=16, align1=3, align2=6, n=16: 10.29 ( 37.51%) 16.46 len=16, align1=6, align2=3, n=16: 10.56 ( 31.97%) 15.53 len=8, align1=5, align2=6, n=16: 10.48 ( 22.21%) 13.47 len=16, align1=6, align2=5, n=8: 10.95 ( 27.84%) 15.17 len=8, align1=5, align2=6, n=16: 10.55 ( 23.09%) 13.71 len=16, align1=6, align2=5, n=8: 10.98 ( 27.79%) 15.20 len=16, align1=4, align2=4, n=16: 10.39 ( 32.51%) 15.40 len=16, align1=4, align2=4, n=16: 10.38 ( 33.76%) 15.68 len=16, align1=4, align2=0, n=16: 10.57 ( 28.87%) 14.86 len=16, align1=0, align2=4, n=16: 10.28 ( 34.27%) 15.64 len=16, align1=4, align2=0, n=32: 10.59 ( 23.24%) 13.79 len=32, align1=0, align2=4, n=16: 11.66 ( 30.50%) 16.77 len=16, align1=4, align2=0, n=32: 10.67 ( 23.98%) 14.04 len=32, align1=0, align2=4, n=16: 11.06 ( 33.61%) 16.66 len=16, align1=5, align2=5, n=16: 10.43 ( 33.52%) 15.68 len=16, align1=5, align2=5, n=16: 10.49 ( 33.47%) 15.77 len=16, align1=5, align2=2, n=16: 10.54 ( 29.46%) 14.94 len=16, align1=2, align2=5, n=16: 10.20 ( 31.63%) 14.92 len=32, align1=3, align2=2, n=64: 13.88 ( 0.59%) 13.97 len=64, align1=2, align2=3, n=32: 11.72 ( 22.36%) 15.09 len=32, align1=3, align2=2, n=64: 13.49 ( 2.26%) 13.81 len=64, align1=2, align2=3, n=32: 11.54 ( 26.22%) 15.64 len=16, align1=6, align2=6, n=16: 10.39 ( 27.70%) 14.37 len=16, align1=6, align2=6, n=16: 9.94 ( 32.04%) 14.63 len=16, align1=6, align2=4, n=16: 9.91 ( 33.92%) 14.99 len=16, align1=4, align2=6, n=16: 10.19 ( 32.66%) 15.14 len=64, align1=2, align2=4, n=128: 14.66 ( 4.10%) 15.29 len=128, align1=4, align2=2, n=64: 18.22 (-17.01%) 15.57 len=64, align1=2, align2=4, n=128: 14.64 ( 3.89%) 15.24 len=128, align1=4, align2=2, n=64: 18.22 (-14.83%) 15.86 len=16, align1=7, align2=7, n=16: 9.86 
( 30.07%) 14.11 len=16, align1=7, align2=7, n=16: 9.86 ( 30.09%) 14.11 len=16, align1=7, align2=6, n=16: 9.93 ( 32.92%) 14.81 len=16, align1=6, align2=7, n=16: 9.83 ( 30.41%) 14.13 len=128, align1=1, align2=6, n=256: 22.24 ( 9.63%) 24.61 len=256, align1=6, align2=1, n=128: 20.91 ( 12.22%) 23.82 len=128, align1=1, align2=6, n=256: 22.21 ( 9.86%) 24.64 len=256, align1=6, align2=1, n=128: 20.81 ( 12.85%) 23.88 len=8, align1=0, align2=0, n=16: 10.33 ( 20.37%) 12.97 len=32, align1=0, align2=0, n=16: 10.75 ( 32.13%) 15.84 len=8, align1=7, align2=2, n=16: 10.38 ( 20.33%) 13.02 len=32, align1=7, align2=2, n=16: 11.03 ( 30.36%) 15.84 len=16, align1=0, align2=0, n=32: 9.98 ( 26.96%) 13.67 len=64, align1=0, align2=0, n=32: 10.94 ( 26.69%) 14.92 len=16, align1=6, align2=4, n=32: 10.07 ( 22.77%) 13.04 len=64, align1=6, align2=4, n=32: 11.68 ( 22.22%) 15.01 len=32, align1=0, align2=0, n=64: 11.15 ( 11.26%) 12.57 len=128, align1=0, align2=0, n=64: 17.59 ( -6.54%) 16.51 len=32, align1=5, align2=6, n=64: 12.56 ( 12.27%) 14.32 len=128, align1=5, align2=6, n=64: 19.12 (-20.33%) 15.89 len=64, align1=0, align2=0, n=128: 12.70 ( 17.81%) 15.45 len=256, align1=0, align2=0, n=128: 22.12 ( 7.72%) 23.97 len=64, align1=4, align2=0, n=128: 12.84 ( 18.75%) 15.81 len=256, align1=4, align2=0, n=128: 21.48 ( 12.33%) 24.50 len=128, align1=0, align2=0, n=256: 19.17 ( 3.24%) 19.81 len=512, align1=0, align2=0, n=256: 26.55 ( 3.43%) 27.49 len=128, align1=3, align2=2, n=256: 20.07 ( 17.46%) 24.32 len=512, align1=3, align2=2, n=256: 26.65 ( 17.61%) 32.35 len=256, align1=0, align2=0, n=512: 22.48 ( 14.46%) 26.28 len=1024, align1=0, align2=0, n=512: 39.85 ( 12.47%) 45.53 len=256, align1=2, align2=4, n=512: 27.00 ( 8.13%) 29.39 len=1024, align1=2, align2=4, n=512: 43.97 ( 15.73%) 52.18 len=512, align1=0, align2=0, n=1024: 32.09 ( 29.08%) 45.25 len=2048, align1=0, align2=0, n=1024: 65.11 ( 7.02%) 70.02 len=512, align1=1, align2=6, n=1024: 35.13 ( 26.54%) 47.83 len=2048, align1=1, align2=6, n=1024: 80.38 (-15.59%) 69.53 len=128, align1=1, align2=0, n=64: 18.89 (-12.93%) 16.72 len=128, align1=0, align2=0, n=64: 16.93 ( -9.06%) 15.52 len=128, align1=0, align2=0, n=64: 16.92 ( -8.70%) 15.57 len=128, align1=0, align2=0, n=64: 17.58 (-12.44%) 15.63 len=64, align1=1, align2=0, n=128: 12.84 ( 18.40%) 15.74 len=64, align1=0, align2=0, n=128: 12.64 ( 19.60%) 15.72 len=64, align1=0, align2=0, n=128: 12.78 ( 17.35%) 15.47 len=64, align1=0, align2=0, n=128: 12.65 ( 18.44%) 15.51 len=128, align1=1, align2=0, n=96: 20.15 ( -9.88%) 18.34 len=128, align1=0, align2=0, n=96: 18.21 ( -3.68%) 17.57 len=128, align1=0, align2=0, n=96: 18.46 ( -5.09%) 17.57 len=128, align1=0, align2=0, n=96: 18.86 ( 1.57%) 19.16 len=96, align1=1, align2=0, n=128: 13.99 ( 15.86%) 16.62 len=96, align1=0, align2=0, n=128: 14.60 ( 11.99%) 16.59 len=96, align1=0, align2=0, n=128: 14.38 ( 20.13%) 18.00 len=96, align1=0, align2=0, n=128: 14.34 ( 11.75%) 16.25 len=128, align1=1, align2=0, n=128: 19.53 ( -0.01%) 19.53 len=128, align1=0, align2=0, n=128: 20.17 ( -3.30%) 19.53 len=128, align1=0, align2=0, n=128: 20.18 (-14.72%) 17.59 len=128, align1=0, align2=0, n=128: 20.82 ( -0.68%) 20.68 len=128, align1=1, align2=0, n=128: 20.01 ( -5.92%) 18.89 len=128, align1=0, align2=0, n=128: 21.37 ( -8.22%) 19.74 len=128, align1=0, align2=0, n=128: 20.17 (-14.75%) 17.57 len=128, align1=0, align2=0, n=128: 20.80 (-18.42%) 17.57 len=128, align1=1, align2=0, n=160: 19.65 ( 15.99%) 23.39 len=128, align1=0, align2=0, n=160: 19.14 ( 3.36%) 19.80 len=128, align1=0, align2=0, n=160: 19.18 ( 3.40%) 
19.85 len=128, align1=0, align2=0, n=160: 19.15 ( 3.36%) 19.81 len=160, align1=1, align2=0, n=128: 18.88 ( 12.02%) 21.46 len=160, align1=0, align2=0, n=128: 20.16 ( 9.62%) 22.31 len=160, align1=0, align2=0, n=128: 20.80 ( 0.05%) 20.81 len=160, align1=0, align2=0, n=128: 20.16 ( 8.81%) 22.11 len=128, align1=1, align2=0, n=192: 19.65 ( 16.12%) 23.42 len=128, align1=0, align2=0, n=192: 19.14 ( 3.37%) 19.80 len=128, align1=0, align2=0, n=192: 19.18 ( 3.16%) 19.80 len=128, align1=0, align2=0, n=192: 19.19 ( 3.06%) 19.80 len=192, align1=1, align2=0, n=128: 18.86 ( 19.40%) 23.40 len=192, align1=0, align2=0, n=128: 20.81 ( 6.46%) 22.24 len=192, align1=0, align2=0, n=128: 20.81 ( 8.70%) 22.79 len=192, align1=0, align2=0, n=128: 21.46 ( 4.55%) 22.48 len=256, align1=1, align2=0, n=192: 20.83 ( 13.49%) 24.08 len=256, align1=0, align2=0, n=192: 21.35 ( 15.83%) 25.37 len=256, align1=0, align2=0, n=192: 20.83 ( 15.85%) 24.75 len=256, align1=0, align2=0, n=192: 21.87 ( 13.82%) 25.37 len=192, align1=1, align2=0, n=256: 22.27 ( 5.03%) 23.45 len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.02 len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.01 len=192, align1=0, align2=0, n=256: 19.57 ( 16.70%) 23.50 len=256, align1=1, align2=0, n=224: 20.84 ( 19.02%) 25.74 len=256, align1=0, align2=0, n=224: 20.91 ( 15.73%) 24.81 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.07 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.06 len=224, align1=1, align2=0, n=256: 20.43 ( 16.38%) 24.43 len=224, align1=0, align2=0, n=256: 19.23 ( 16.62%) 23.06 len=224, align1=0, align2=0, n=256: 19.21 ( 16.84%) 23.10 len=224, align1=0, align2=0, n=256: 19.24 ( 16.77%) 23.12 len=256, align1=1, align2=0, n=256: 24.05 ( 5.44%) 25.44 len=256, align1=0, align2=0, n=256: 21.63 ( 14.98%) 25.45 len=256, align1=0, align2=0, n=256: 20.81 ( 13.64%) 24.10 len=256, align1=0, align2=0, n=256: 20.81 ( 13.67%) 24.10 len=256, align1=1, align2=0, n=256: 24.10 ( -0.20%) 24.05 len=256, align1=0, align2=0, n=256: 21.46 ( 16.56%) 25.71 len=256, align1=0, align2=0, n=256: 21.46 ( 10.79%) 24.05 len=256, align1=0, align2=0, n=256: 20.81 ( 14.64%) 24.38 len=256, align1=1, align2=0, n=288: 24.21 ( 15.45%) 28.63 len=256, align1=0, align2=0, n=288: 23.11 ( 12.68%) 26.46 len=256, align1=0, align2=0, n=288: 22.55 ( 14.25%) 26.29 len=256, align1=0, align2=0, n=288: 22.49 ( 14.49%) 26.30 len=288, align1=1, align2=0, n=256: 24.06 ( 5.36%) 25.42 len=288, align1=0, align2=0, n=256: 22.82 ( 7.35%) 24.63 len=288, align1=0, align2=0, n=256: 22.80 ( 10.98%) 25.62 len=288, align1=0, align2=0, n=256: 21.46 ( 17.56%) 26.03 len=256, align1=1, align2=0, n=320: 24.17 ( 15.82%) 28.71 len=256, align1=0, align2=0, n=320: 22.44 ( 14.79%) 26.34 len=256, align1=0, align2=0, n=320: 22.56 ( 14.14%) 26.27 len=256, align1=0, align2=0, n=320: 22.50 ( 14.35%) 26.27 len=320, align1=1, align2=0, n=256: 24.10 ( 8.33%) 26.29 len=320, align1=0, align2=0, n=256: 22.11 ( 16.28%) 26.41 len=320, align1=0, align2=0, n=256: 21.57 ( 16.27%) 25.76 len=320, align1=0, align2=0, n=256: 21.46 ( 15.42%) 25.37 len=512, align1=1, align2=0, n=448: 27.62 ( 31.43%) 40.28 len=512, align1=0, align2=0, n=448: 27.63 ( 32.11%) 40.70 len=512, align1=0, align2=0, n=448: 26.53 ( 35.05%) 40.85 len=512, align1=0, align2=0, n=448: 26.51 ( 34.99%) 40.78 len=448, align1=1, align2=0, n=512: 31.01 ( 28.08%) 43.11 len=448, align1=0, align2=0, n=512: 29.35 ( 36.94%) 46.54 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.63 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.64 len=512, align1=1, 
align2=0, n=480: 28.24 ( 35.42%) 43.73 len=512, align1=0, align2=0, n=480: 28.76 ( 28.65%) 40.31 len=512, align1=0, align2=0, n=480: 28.47 ( 30.82%) 41.16 len=512, align1=0, align2=0, n=480: 26.70 ( 31.68%) 39.08 len=480, align1=1, align2=0, n=512: 30.73 ( 26.75%) 41.95 len=480, align1=0, align2=0, n=512: 28.79 ( 34.92%) 44.23 len=480, align1=0, align2=0, n=512: 28.76 ( 35.89%) 44.87 len=480, align1=0, align2=0, n=512: 29.39 ( 35.67%) 45.68 len=512, align1=1, align2=0, n=512: 30.58 ( 25.28%) 40.92 len=512, align1=0, align2=0, n=512: 26.67 ( 31.41%) 38.87 len=512, align1=0, align2=0, n=512: 26.67 ( 34.15%) 40.50 len=512, align1=0, align2=0, n=512: 27.17 ( 30.43%) 39.06 len=512, align1=1, align2=0, n=512: 30.63 ( 25.12%) 40.91 len=512, align1=0, align2=0, n=512: 26.74 ( 31.56%) 39.06 len=512, align1=0, align2=0, n=512: 26.72 ( 31.55%) 39.04 len=512, align1=0, align2=0, n=512: 26.74 ( 31.11%) 38.81 len=512, align1=1, align2=0, n=544: 33.43 ( 21.70%) 42.69 len=512, align1=0, align2=0, n=544: 31.96 ( 27.77%) 44.25 len=512, align1=0, align2=0, n=544: 31.36 ( 27.40%) 43.20 len=512, align1=0, align2=0, n=544: 31.41 ( 27.14%) 43.11 len=544, align1=1, align2=0, n=512: 30.55 ( 25.76%) 41.15 len=544, align1=0, align2=0, n=512: 27.26 ( 31.01%) 39.51 len=544, align1=0, align2=0, n=512: 27.30 ( 30.74%) 39.41 len=544, align1=0, align2=0, n=512: 26.65 ( 32.38%) 39.40 len=512, align1=1, align2=0, n=576: 33.39 ( 21.56%) 42.58 len=512, align1=0, align2=0, n=576: 31.41 ( 28.37%) 43.85 len=512, align1=0, align2=0, n=576: 31.41 ( 27.57%) 43.37 len=512, align1=0, align2=0, n=576: 31.42 ( 27.41%) 43.28 len=576, align1=1, align2=0, n=512: 30.61 ( 27.75%) 42.36 len=576, align1=0, align2=0, n=512: 27.66 ( 31.54%) 40.40 len=576, align1=0, align2=0, n=512: 28.04 ( 30.84%) 40.55 len=576, align1=0, align2=0, n=512: 27.94 ( 31.15%) 40.58 len=1024, align1=1, align2=0, n=960: 39.78 ( 28.72%) 55.80 len=1024, align1=0, align2=0, n=960: 40.87 ( 26.15%) 55.34 len=1024, align1=0, align2=0, n=960: 40.06 ( 26.81%) 54.73 len=1024, align1=0, align2=0, n=960: 40.25 ( 26.40%) 54.69 len=960, align1=1, align2=0, n=1024: 38.74 ( 31.46%) 56.52 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.30%) 60.24 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.36%) 60.30 len=960, align1=0, align2=0, n=1024: 39.88 ( 35.25%) 61.60 len=1024, align1=1, align2=0, n=992: 39.71 ( 28.13%) 55.26 len=1024, align1=0, align2=0, n=992: 39.85 ( 29.39%) 56.44 len=1024, align1=0, align2=0, n=992: 40.34 ( 25.81%) 54.37 len=1024, align1=0, align2=0, n=992: 40.31 ( 25.91%) 54.40 len=992, align1=1, align2=0, n=1024: 37.72 ( 32.49%) 55.88 len=992, align1=0, align2=0, n=1024: 38.37 ( 36.02%) 59.97 len=992, align1=0, align2=0, n=1024: 38.42 ( 35.53%) 59.60 len=992, align1=0, align2=0, n=1024: 38.40 ( 35.67%) 59.69 len=1024, align1=1, align2=0, n=1024: 40.88 ( 26.02%) 55.26 len=1024, align1=0, align2=0, n=1024: 40.36 ( 25.56%) 54.22 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.60%) 54.19 len=1024, align1=0, align2=0, n=1024: 40.35 ( 29.70%) 57.40 len=1024, align1=1, align2=0, n=1024: 41.03 ( 25.71%) 55.22 len=1024, align1=0, align2=0, n=1024: 40.37 ( 25.42%) 54.13 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.64%) 54.21 len=1024, align1=0, align2=0, n=1024: 40.32 ( 25.60%) 54.19 len=1024, align1=1, align2=0, n=1056: 41.06 ( 25.94%) 55.45 len=1024, align1=0, align2=0, n=1056: 41.06 ( 29.54%) 58.27 len=1024, align1=0, align2=0, n=1056: 41.05 ( 28.94%) 57.77 len=1024, align1=0, align2=0, n=1056: 41.02 ( 28.82%) 57.62 len=1056, align1=1, align2=0, n=1024: 41.00 ( 26.23%) 
55.59 len=1056, align1=0, align2=0, n=1024: 39.67 ( 27.07%) 54.39 len=1056, align1=0, align2=0, n=1024: 40.34 ( 29.19%) 56.97 len=1056, align1=0, align2=0, n=1024: 40.37 ( 27.52%) 55.71 len=1024, align1=1, align2=0, n=1088: 41.02 ( 26.33%) 55.68 len=1024, align1=0, align2=0, n=1088: 41.06 ( 30.82%) 59.35 len=1024, align1=0, align2=0, n=1088: 41.05 ( 29.58%) 58.29 len=1024, align1=0, align2=0, n=1088: 41.14 ( 28.69%) 57.69 len=1088, align1=1, align2=0, n=1024: 41.31 ( 27.50%) 56.98 len=1088, align1=0, align2=0, n=1024: 40.32 ( 29.25%) 56.99 len=1088, align1=0, align2=0, n=1024: 40.74 ( 27.82%) 56.44 len=1088, align1=0, align2=0, n=1024: 40.70 ( 26.62%) 55.47
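A note on reading the rows above: each line gives the AVX2 timing first, then in parentheses the relative improvement over the generic timing shown last. The percentage is consistent with being computed as (generic - avx2) / generic; that is an inference from the numbers themselves, not something taken from the benchmark source. Negative values (e.g. the wcslcpy row showing -52.13%) are cases where the AVX2 variant measured slower than the generic one. For example, for the first wcslcpy row:

    #include <stdio.h>

    int
    main (void)
    {
      /* First wcslcpy row: 14.99 ( 24.63%) 19.89.  */
      double avx2 = 14.99, generic = 19.89;
      printf ("%.2f%%\n", (generic - avx2) / generic * 100.0);
      /* Prints 24.64%, matching the table up to rounding.  */
      return 0;
    }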
On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
>>
>> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
>> > Think we should at the very least wait for the generic strlcpy codes
>> > to land first.
>>
>> Let's not optimize these functions at all unless there's a good and
>> measured reason to do so. In practice I expect they're called with
>> small sizes, for which optimization is a net minus: it consumes
>> valuable maintenance time with no real benefit.
>
> Hi Paul,
>
> Attached is strcpy/wcslcpy microbenchmark data based on Noah
> strlcpy/wcslcpy microbenchmark patch.
>
I don't think the concern is that we can beat the generic impl (which
hasn't even landed yet AFAICT), it's whether doing so makes sense given
the usage/goal of the functions.

> https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html
>
> Thanks,
> Sunil
On Fri, Jun 30, 2023 at 6:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote:
> >>
> >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
> >> > Think we should at the very least wait for the generic strlcpy codes
> >> > to land first.
> >>
> >> Let's not optimize these functions at all unless there's a good and
> >> measured reason to do so. In practice I expect they're called with
> >> small sizes, for which optimization is a net minus: it consumes
> >> valuable maintenance time with no real benefit.
> >
> > Hi Paul,
> >
> > Attached is strcpy/wcslcpy microbenchmark data based on Noah
> > strlcpy/wcslcpy microbenchmark patch.
> >
> I don't think the concern is that we can beat the generic impl (which
> hasn't even landed yet AFAICT), it's whether doing so makes sense given
> the usage/goal of the functions.
>
That being said, I'm generally in favor of adding optimized versions since
we happen to be in a position where at least several developers find it
worth their time to maintain, but not before the generic versions have
landed.

> > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html
> >
> > Thanks,
> > Sunil
* Noah Goldstein via Libc-alpha:
> Think we should at the very least wait for the generic strlcpy codes
> to land first.

Do you mean a version of string/strlcpy.c that is based on a modified
string/stplcpy.c, rather than the one we have now that calls just strlen
and memcpy?

Thanks,
Florian
On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein via Libc-alpha:
>
> > Think we should at the very least wait for the generic strlcpy codes
> > to land first.
>
> Do you mean a version of string/strlcpy.c that is based on a modified
> string/stplcpy.c, rather than the one we have now that calls just strlen
> and memcpy?

Hmm? I mean your strlcpy/strlcat patch to land.

> Thanks,
> Florian
* Noah Goldstein:

> On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
>>
>> * Noah Goldstein via Libc-alpha:
>>
>> > Think we should at the very least wait for the generic strlcpy codes
>> > to land first.
>>
>> Do you mean a version of string/strlcpy.c that is based on a modified
>> string/stplcpy.c, rather than the one we have now that calls just strlen
>> and memcpy?
>
> Hmm? I mean your strlcpy/strlcat patch to land.

That has already happened?

Thanks,
Florian
On Sun, Jul 2, 2023 at 1:51 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * Noah Goldstein:
>
> > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote:
> >>
> >> * Noah Goldstein via Libc-alpha:
> >>
> >> > Think we should at the very least wait for the generic strlcpy codes
> >> > to land first.
> >>
> >> Do you mean a version of string/strlcpy.c that is based on a modified
> >> string/stplcpy.c, rather than the one we have now that calls just strlen
> >> and memcpy?
> >
> > Hmm? I mean your strlcpy/strlcat patch to land.
>
> That has already happened?

:/ yup had been a minute since I pulled.

Are we getting stplcpy?

> Thanks,
> Florian
* Noah Goldstein:

>> >> Do you mean a version of string/strlcpy.c that is based on a modified
>> >> string/stplcpy.c, rather than the one we have now that calls just strlen
>> >> and memcpy?
>> >
>> > Hmm? I mean your strlcpy/strlcat patch to land.
>>
>> That has already happened?

> :/ yup had been a minute since I pulled.
>
> Are we getting stplcpy?

No. I mentioned string/stplcpy.c because it's what the generic strcpy is
based upon. Sorry for the confusion.

Thanks,
Florian
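The strlen-plus-memcpy shape Florian refers to is essentially the reference sketch shown earlier on this page. A stpcpy-style generic version, by contrast, would fuse the scan and the copy into a single bounded pass and only keep counting once the buffer is full. A rough sketch of that shape (hypothetical, assuming the same strlcpy contract; this is not actual glibc code):

    #include <stddef.h>

    size_t
    strlcpy_one_pass (char *dest, const char *src, size_t size)
    {
      const char *p = src;
      if (size > 0)
        {
          char *d = dest;
          char *end = dest + size - 1;
          /* Copy while scanning; stop at the NUL or at the bound.  */
          while (d < end && (*d = *p) != '\0')
            {
              d++;
              p++;
            }
          *d = '\0';
        }
      /* Finish counting the source length for the return value.  */
      while (*p != '\0')
        p++;
      return (size_t) (p - src);
    }

Either shape satisfies the same contract; the thread's point is that the generic baseline should settle before arch-specific variants like the AVX2 one above are tuned against it.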
On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > wcscpy-evex \ > wcscpy-generic \ > wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. 
*/ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. */ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy chracter must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. */ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. 
*/ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. */ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) Is strlcpy/strlcat integratable with existing strncat impl? Had figured they would fit in the same file. > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 >
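To make Noah's file-sharing question above concrete: the patch already shares one body between strlcpy and wcslcpy through a tiny preprocessor wrapper (wcslcpy-avx2.S, quoted above). A shared strlcat entry point would presumably follow the same pattern, something like the hypothetical wrapper below; __strlcat_avx2 and USE_AS_STRLCAT are invented names, assuming strlcpy-avx2.S grew an append mode:

/* Hypothetical strlcat-avx2.S, mirroring the wcslcpy-avx2.S wrapper in
   this patch.  USE_AS_STRLCAT and __strlcat_avx2 are invented names,
   shown only to illustrate the wrapper pattern.  */
#define STRLCPY __strlcat_avx2
#define USE_AS_STRLCAT 1

#include "strlcpy-avx2.S"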
On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:

> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch optimizes strlcpy/wsclcpy string functions for AVX2.
> > [...]
> > +END (STRLCPY)
>
> Is strlcpy/strlcat integratable with existing strncat impl? Had
> figured they would fit in the same file.

Hi Noah,

It may not be a good idea to put strlcpy/strlcat in the existing
strncpy/strncat impl file, as the strlcpy/strlcat functions are
associated with the GLIBC_2.38 ABI.

--Sunil

> > [...]
> > --
> > 2.38.1
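Background on the ABI point: strlcpy/strlcat and wcslcpy/wcslcat entered glibc at the GLIBC_2.38 symbol version, whereas the strncpy/strncat family has been exported at each port's baseline version. Schematically, the new symbols are tied to the ABI roughly like this in the relevant Versions files (a condensed illustration, not a literal copy of string/Versions or wcsmbs/Versions):

libc {
  GLIBC_2.38 {
    strlcat; strlcpy;
    wcslcat; wcslcpy;
  }
}

which is part of why keeping the new functions in their own implementation files is attractive.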
On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>>
>> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>> >
>> > This patch optimizes strlcpy/wsclcpy string functions for AVX2.
>> > [...]
*/ >> > + lea VEC_SIZE(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_second_vector): >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_third_vector) >> > + >> > +L(ret): >> > + ret >> > + >> > +L(ret_vec_x3): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_third_vector): >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +L(ret_vec_x4): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_fourth_vector): >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +END (STRLCPY) >> >> Is strlcpy/strlcat integratable with existing strncat impl? Had >> figured they would >> fit in the same file. > > > Hi Noah, > > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strnat impl file, > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. > Well, we can put the impl there and include it from another to manage any special link cases. > --Sunil > >> >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > new file mode 100644 >> > index 0000000000..eee3b7b086 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* strlcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. 
*/ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __strlcpy __strlcpy_generic >> > +# include <string/strlcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c >> > new file mode 100644 >> > index 0000000000..ded41fbcfb >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c >> > @@ -0,0 +1,36 @@ >> > +/* Multiple versions of strlcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. */ >> > +#if IS_IN (libc) >> > +# define __strlcpy __redirect_strlcpy >> > +# include <string.h> >> > +# undef __strlcpy >> > + >> > +# define SYMBOL_NAME strlcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__strlcpy, strlcpy) >> > + >> > +# ifdef SHARED >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); >> > +# endif >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > new file mode 100644 >> > index 0000000000..dafc20ded0 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > @@ -0,0 +1,4 @@ >> > +#define STRLCPY __wcslcpy_avx2 >> > +#define USE_AS_WCSLCPY 1 >> > + >> > +#include "strlcpy-avx2.S" >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > new file mode 100644 >> > index 0000000000..ffd3c0e846 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* wcslcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. 
*/ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __wcslcpy __wcslcpy_generic >> > +# include <wcsmbs/wcslcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c >> > new file mode 100644 >> > index 0000000000..371ef9626c >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c >> > @@ -0,0 +1,35 @@ >> > +/* Multiple versions of wcslcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. */ >> > +#if IS_IN (libc) >> > +# define __wcslcpy __redirect_wcslcpy >> > +# include <wchar.h> >> > +# undef __wcslcpy >> > + >> > +# define SYMBOL_NAME wcslcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__wcslcpy, wcslcpy) >> > +# ifdef SHARED >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); >> > +# endif >> > +#endif >> > -- >> > 2.38.1 >> >
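For readers tracing the assembly above, it helps to keep the plain-C contract in mind: strlcpy copies at most N-1 characters, always NUL-terminates the destination when N is nonzero, and returns the full length of the source. The return-value paths (L(ret_vec_x1..4), L(endloop)) and the cmovb clamping of RDX in the AVX2 code all serve this contract. Below is a minimal sketch of that contract in C -- illustrative only, not glibc's actual generic implementation; the name sketch_strlcpy is ours:

#include <string.h>

/* Sketch of the strlcpy contract the AVX2 code implements: copy at
   most N-1 characters, NUL-terminate whenever N > 0, and return the
   full length of SRC regardless of how much was copied.  */
size_t
sketch_strlcpy (char *dst, const char *src, size_t n)
{
  size_t len = strlen (src);	/* Return value: full source length.  */
  if (n != 0)
    {
      size_t copy = len < n ? len : n - 1;
      memcpy (dst, src, copy);	/* Truncated payload.  */
      dst[copy] = '\0';		/* Last stored character is the NUL.  */
    }
  return len;
}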
On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >>
> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha
> >> <libc-alpha@sourceware.org> wrote:
> >> >
> >> > This patch optimizes the strlcpy/wcslcpy string functions for AVX2.
> >> > [... full patch quoted above, trimmed here ...]
> >>
> >> Is strlcpy/strlcat integratable with the existing strncat impl? Had
> >> figured they would fit in the same file.
> >
> > Hi Noah,
> >
> > It may not be a good idea to put strlcpy/strlcat in the existing
> > strncpy/strncat impl file, as the strlcpy/strlcat functions are
> > associated with the GLIBC_2.38 ABI.
> >
> > --Sunil
> >
> Well, we can put the impl there and include it from another to manage
> any special link cases.

Due to the ABI, none of the strlcpy/strlcat changes can go into a glibc version earlier than 2.38; to avoid any future strncpy backporting complications, it is better to keep them in separate files for now.
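For context on the GLIBC_2.38 point: strlcpy/strlcat and wcslcpy/wcslcat are new in glibc 2.38, so the exported symbols sit under that version node. The Versions entry involved would look roughly like this -- a sketch for orientation, not a hunk from this patch series:

libc {
  GLIBC_2.38 {
    strlcat; strlcpy; wcslcat; wcslcpy;
  }
}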
On Sun, Jul 2, 2023 at 8:04 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: >> > >> > >> > >> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> >> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha >> >> <libc-alpha@sourceware.org> wrote: >> >> > >> >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. >> >> > --- >> >> > sysdeps/x86_64/multiarch/Makefile | 4 + >> >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + >> >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ >> >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + >> >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ >> >> > 9 files changed, 627 insertions(+) >> >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c >> >> > >> >> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile >> >> > index e1e894c963..7e3fc081df 100644 >> >> > --- a/sysdeps/x86_64/multiarch/Makefile >> >> > +++ b/sysdeps/x86_64/multiarch/Makefile >> >> > @@ -82,6 +82,8 @@ sysdep_routines += \ >> >> > strcpy-sse2 \ >> >> > strcpy-sse2-unaligned \ >> >> > strcspn-sse4 \ >> >> > + strlcpy-avx2 \ >> >> > + strlcpy-generic \ >> >> > strlen-avx2 \ >> >> > strlen-avx2-rtm \ >> >> > strlen-evex \ >> >> > @@ -153,6 +155,8 @@ sysdep_routines += \ >> >> > wcscpy-evex \ >> >> > wcscpy-generic \ >> >> > wcscpy-ssse3 \ >> >> > + wcslcpy-avx2 \ >> >> > + wcslcpy-generic \ >> >> > wcslen-avx2 \ >> >> > wcslen-avx2-rtm \ >> >> > wcslen-evex \ >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > index 5427ff1907..9928dee187 100644 >> >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __strncat_sse2_unaligned)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ >> >> > + IFUNC_IMPL (i, name, strlcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __strlcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, >> >> > + 1, >> >> > + __strlcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ >> >> > IFUNC_IMPL (i, name, strncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, >> >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __wcscpy_generic)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. 
*/ >> >> > + IFUNC_IMPL (i, name, wcslcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __wcslcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, >> >> > + 1, >> >> > + __wcslcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ >> >> > IFUNC_IMPL (i, name, wcsncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > new file mode 100644 >> >> > index 0000000000..982a30d15b >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > @@ -0,0 +1,34 @@ >> >> > +/* Common definition for ifunc selections. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > +#include <init-arch.h> >> >> > + >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; >> >> > + >> >> > +static inline void * >> >> > +IFUNC_SELECTOR (void) >> >> > +{ >> >> > + const struct cpu_features *cpu_features = __get_cpu_features (); >> >> > + >> >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) >> >> > + return OPTIMIZE (avx2); >> >> > + >> >> > + return OPTIMIZE (generic); >> >> > +} >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > new file mode 100644 >> >> > index 0000000000..cf54b1e990 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > @@ -0,0 +1,446 @@ >> >> > +/* Strlcpy/wcslcpy optimized with AVX2. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. 
*/ >> >> > + >> >> > +#include <isa-level.h> >> >> > + >> >> > +#if ISA_SHOULD_BUILD (3) >> >> > + >> >> > +# include <sysdep.h> >> >> > + >> >> > +# ifndef VEC_SIZE >> >> > +# include "x86-avx-vecs.h" >> >> > +# endif >> >> > + >> >> > +# ifndef STRLCPY >> >> > +# define STRLCPY __strlcpy_avx2 >> >> > +# endif >> >> > + >> >> > + >> >> > +# ifdef USE_AS_WCSLCPY >> >> > +# define CHAR_SIZE 4 >> >> > +# define MOVU movl >> >> > +# define VPCMPEQ vpcmpeqd >> >> > +# define VPMINU vpminud >> >> > +# else >> >> > +# define CHAR_SIZE 1 >> >> > +# define MOVU movb >> >> > +# define VPCMPEQ vpcmpeqb >> >> > +# define VPMINU vpminub >> >> > +# endif >> >> > + >> >> > +# define PMOVMSK vpmovmskb >> >> > +# define PAGE_SIZE 4096 >> >> > +# define VEC_SIZE 32 >> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) >> >> > + >> >> > + .section SECTION(.text),"ax",@progbits >> >> > +/* Aligning entry point to 64 byte, provides better performance for >> >> > + one vector length string. */ >> >> > + >> >> > +ENTRY_P2ALIGN (STRLCPY, 6) >> >> > +# ifdef __ILP32__ >> >> > + /* Clear the upper 32 bits. */ >> >> > + movl %edx, %edx >> >> > +# endif >> >> > + >> >> > + /* Zero out vector register for end of string comparison. */ >> >> > + vpxor %VMM(0), %VMM(0), %VMM(0) >> >> > + /* Save source pointer for return calculation. */ >> >> > + mov %rsi, %r8 >> >> > + mov %esi, %eax >> >> > + sall $20, %eax >> >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax >> >> > + ja L(page_cross) >> >> > + >> >> > +L(page_cross_continue): >> >> > + /* Load first vector. */ >> >> > + VMOVU (%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + PMOVMSK %VMM(2), %eax >> >> > + test %eax, %eax >> >> > + jnz L(ret_vec_x1) >> >> > + >> >> > + test %rdx, %rdx >> >> > + jz L(continue_second_vector) >> >> > + >> >> > + /* Check whether we can copy full vector. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(page_cross_small_vec_copy) >> >> > + /* Copy first vector. */ >> >> > + VMOVU %VMM(1), (%rdi) >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + >> >> > +L(continue_second_vector): >> >> > + /* Align RSI pointer and adjust RDI based on offset. */ >> >> > + mov %rsi, %rax >> >> > + and $-VEC_SIZE, %rsi >> >> > + sub %rsi, %rax >> >> > + sub %rax, %rdi >> >> > + >> >> > + /* Check if string already copied N char, and RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(skip_copy_alignment_fix) >> >> > + >> >> > + /* Adjust RDX for copy alignment fix. */ >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > +# endif >> >> > + add %rax, %rdx >> >> > + >> >> > +L(skip_copy_alignment_fix): >> >> > + /* Load second vector. */ >> >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x2) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(continue_third_vector) >> >> > + >> >> > + /* Jump below/equal(instead of below) used here, because last >> >> > + copy chracter must be NULL. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_second_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy second vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) >> >> > + >> >> > +L(continue_third_vector): >> >> > + /* Load third vector. */ >> >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x3) >> >> > + >> >> > + /* Skip copy if RDX is 0. 
>> >> > [... remainder of the quoted strlcpy-avx2.S assembly omitted; it appears in full in the patch at the end of this page ...]
>> >>
>> >> Is strlcpy/strlcat integratable with the existing strncat impl? Had
>> >> figured they would fit in the same file.
>> >
>> > Hi Noah,
>> >
>> > It may not be a good idea to put strlcpy/strlcat in the existing
>> > strncpy/strncat impl files, as the strlcpy/strlcat functions are
>> > associated with the GLIBC_2.38 ABI.
>> >
>> > --Sunil
>> >
>> Well, we can put the impl there and include it from another file to
>> manage any special link cases.
>
> Due to the ABI, none of the strlcpy/strlcat changes can go into a glibc
> version earlier than 2.38. To avoid any future strncpy backporting
> complications, it is better to keep them in separate files for now.
>
I get that, but can't we just have one impl file that implements the
logic for all of the functions? It would only build the strl* entry
points if it is included to do so (similar to how the strlen avx512
impl is currently set up).
>> >> > [... quoted strlcpy-generic.c, strlcpy.c, wcslcpy-avx2.S, wcslcpy-generic.c and wcslcpy.c omitted; see the full patch at the end of this page ...]
>> >> > --
>> >> > 2.38.1
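The arrangement Noah describes here is the usual glibc single-impl-file
trick: one file holds the shared logic, and a given entry point is only
emitted when a wrapper file asks for it. A minimal sketch with
hypothetical file and macro names (the real strlen AVX-512 files differ
in detail):

    /* strlcpy-avx2-impl.S (hypothetical name): shared body, emitted
       only when a wrapper requests it.  */
    #ifdef BUILD_STRLCPY
    ENTRY (STRLCPY)
            /* ... common copy-and-length logic ... */
    END (STRLCPY)
    #endif

    /* strlcpy-avx2.S (hypothetical wrapper): request the strl* entry
       point and name it.  */
    #define BUILD_STRLCPY 1
    #define STRLCPY __strlcpy_avx2
    #include "strlcpy-avx2-impl.S"

Under such a layout the GLIBC_2.38-only symbols never appear in a file
that might be backported, while the body could still be shared with the
strncpy family.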
On 30/06/23 18:27, Paul Eggert wrote:
> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote:
>> Think we should at the very least wait for the generic strlcpy codes
>> to land first.
>
> Let's not optimize these functions at all, unless there's a good,
> measured reason to do so. In practice I expect they're called with
> small sizes, for which optimization is a net minus: it consumes
> valuable maintenance time with no real benefit.

I tend to agree. Although these functions are now added in the next
POSIX, my understanding is that they are still not encouraged to be
used, due to the multiple shortcomings raised in previous discussions.
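For reference, the contract under discussion works roughly as follows
(a minimal sketch of the textbook strlcpy semantics, not the actual
glibc generic implementation):

    #include <string.h>

    size_t
    strlcpy_sketch (char *dst, const char *src, size_t size)
    {
      size_t len = strlen (src);   /* Always scans the whole source.  */
      if (size > 0)
        {
          size_t n = len < size ? len : size - 1;
          memcpy (dst, src, n);    /* Copy at most size - 1 chars...  */
          dst[n] = '\0';           /* ...and always NUL-terminate.  */
        }
      return len;   /* Full source length, so truncation is detectable.  */
    }

The unconditional strlen is worth noting: computing the return value
means walking the entire source even when the destination is tiny, so
the copy itself is rarely where the time goes.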
On 2023-06-30 15:21, Sunil Pandey wrote:
> Attached is strlcpy/wcslcpy microbenchmark data based on Noah's
> strlcpy/wcslcpy microbenchmark patch.

Although it's helpful to know that the proposed patch improves
microbenchmark scores, that's not enough to justify it. Let's see
benchmarks of real programs. If they don't show significant wins, let's
not bother.

Programs that use strlcpy, by and large, don't use it in
performance-sensitive areas, and their developers and users are far
more worried about security than about performance. Making the
implementation harder to audit will likely be a net negative for these
applications. This doesn't sound like a win at all.

Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?
On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote:
>
> On 2023-06-30 15:21, Sunil Pandey wrote:
> > Attached is strlcpy/wcslcpy microbenchmark data based on Noah's
> > strlcpy/wcslcpy microbenchmark patch.
>
> Although it's helpful to know that the proposed patch improves
> microbenchmark scores, that's not enough to justify it. Let's see
> benchmarks of real programs. If they don't show significant wins,
> let's not bother.
>
> Programs that use strlcpy, by and large, don't use it in
> performance-sensitive areas, and their developers and users are far
> more worried about security than about performance. Making the
> implementation harder to audit will likely be a net negative for
> these applications. This doesn't sound like a win at all.
>
> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?

Think we should look into dropping the optimized strcpy/wcscpy family
in general? For the most part I don't see them in perf-sensitive areas
anyway (generally, people who care about perf maintain the length and
use the mem* functions).
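The pattern Noah refers to, for concreteness (an illustrative fragment,
not code from this patch; the caller guarantees dst has room):

    #include <string.h>

    /* Performance-sensitive code tracks lengths itself and appends
       with memcpy, so nothing ever has to scan for a terminator.
       Returns the new length of dst.  */
    static size_t
    append_tracked (char *dst, size_t dst_len,
                    const char *src, size_t src_len)
    {
      memcpy (dst + dst_len, src, src_len);
      dst[dst_len + src_len] = '\0';
      return dst_len + src_len;
    }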
On 03/07/23 15:40, Noah Goldstein via Libc-alpha wrote:
> On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote:
>> [...]
>> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it?
>
> Think we should look into dropping the optimized strcpy/wcscpy family
> in general? For the most part I don't see them in perf-sensitive areas
> anyway (generally, people who care about perf maintain the length and
> use the mem* functions).

I will go for it; these interfaces are provided mainly to comply with
standards, and for x86 they only add more maintenance burden.
On 2023-07-03 11:40, Noah Goldstein wrote:
> Think we should look into dropping the optimized strcpy/wcscpy family
> in general?

For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink.

strcpy optimization might be worth keeping, as it's used so much more.
Measurements of real programs would help decide. In the meantime,
inertia suggests that when in doubt, leave it alone.

For strlcpy it's an easy call: don't optimize unless realistic
benchmarks show it's a win.
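For context, a typical strlcpy call site looks like this (hypothetical
names, assuming a platform that declares strlcpy, i.e. glibc 2.38+ or
the BSDs):

    #include <string.h>

    /* The return value is compared against the buffer size to detect
       truncation; the truncation branch is an error path, not a hot
       path, which is why such callers weight safety over speed.  */
    static int
    set_label (char *dst, size_t dst_size, const char *input)
    {
      if (strlcpy (dst, input, dst_size) >= dst_size)
        return -1;   /* Input did not fit.  */
      return 0;
    }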
On 7/3/23 23:14, Paul Eggert wrote:
> On 2023-07-03 11:40, Noah Goldstein wrote:
>> Think we should look into dropping the optimized strcpy/wcscpy family
>> in general?
>
> For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time
> sink.
>
> strcpy optimization might be worth keeping, as it's used so much more.
> Measurements of real programs would help decide. In the meantime,
> inertia suggests that when in doubt, leave it alone.
>
> For strlcpy it's an easy call: don't optimize unless realistic
> benchmarks show it's a win.

I guess it depends on just how much people use BSD software on Linux,
because if you're looking at the BSDs the amount of strlcpy usage is
just absurdly massive - OpenBSD's tree has 4997 occurrences of it,
while memcpy is present 13470 times. That still means memcpy is used 3
times as often, but the idea that strlcpy is so popular as to be used
to a remotely comparable degree is itself kind of astonishing.
On 2023-07-03 15:04, Gabriel Ravier wrote:
> OpenBSD's tree has 4997 occurrences of it
Many years ago the OpenBSD team went through its source code and
replaced uses of strcpy with strlcpy, without much thought involved and
even introducing problems in the process. I expect that not much of this
code is used elsewhere and it's not that relevant to glibc. Of the
little OpenBSDish code that is relevant (notably OpenSSH) I expect the
performance difference to be so small as to not be worth optimizing
glibc for. Real-world-ish benchmarks could help check this.
On Jul 03 2023, Paul Eggert wrote:

> On 2023-07-03 15:04, Gabriel Ravier wrote:
>> OpenBSD's tree has 4997 occurrences of it
>
> Many years ago the OpenBSD team went through its source code and
> replaced uses of strcpy with strlcpy, without much thought involved
> and even introducing problems in the process.

In the Linux kernel sources all uses of strlcpy are being erased,
because the developers have realized how crappy that interface is.
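The kernel's replacement, strscpy, bounds the read rather than just the
write: it never looks past the given size and reports truncation
through its return value, whereas strlcpy must scan the whole source to
compute its return length. A kernel-style fragment (strscpy is a Linux
kernel API, not a glibc one; names here are illustrative):

        char name[32];
        ssize_t n = strscpy(name, src, sizeof(name));

        if (n < 0)
                return n;       /* -E2BIG: src did not fit; no
                                   full-length scan was needed.  */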
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e1e894c963..7e3fc081df 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -82,6 +82,8 @@ sysdep_routines += \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
   strcspn-sse4 \
+  strlcpy-avx2 \
+  strlcpy-generic \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
@@ -153,6 +155,8 @@ sysdep_routines += \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
+  wcslcpy-avx2 \
+  wcslcpy-generic \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5427ff1907..9928dee187 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      1,
 			      __strncat_sse2_unaligned))
 
+  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
+  IFUNC_IMPL (i, name, strlcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __strlcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
+				     1,
+				     __strlcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
	      X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
@@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      1,
 			      __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
+  IFUNC_IMPL (i, name, wcslcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __wcslcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
+				     1,
+				     __wcslcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
   IFUNC_IMPL (i, name, wcsncpy,
	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
new file mode 100644
index 0000000000..982a30d15b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
@@ -0,0 +1,34 @@
+/* Common definition for ifunc selections.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    return OPTIMIZE (avx2);
+
+  return OPTIMIZE (generic);
+}
diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
new file mode 100644
index 0000000000..cf54b1e990
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
@@ -0,0 +1,446 @@
+/* Strlcpy/wcslcpy optimized with AVX2.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRLCPY
+#  define STRLCPY	__strlcpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSLCPY
+#  define CHAR_SIZE	4
+#  define MOVU		movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+# else
+#  define CHAR_SIZE	1
+#  define MOVU		movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+# endif
+
+# define PMOVMSK	vpmovmskb
+# define PAGE_SIZE	4096
+# define VEC_SIZE	32
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for one-vector-length strings.  */
+
+ENTRY_P2ALIGN (STRLCPY, 6)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+
+	/* Zero out vector register for end-of-string comparison.  */
+	vpxor	%VMM(0), %VMM(0), %VMM(0)
+	/* Save source pointer for return calculation.  */
+	mov	%rsi, %r8
+	mov	%esi, %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	/* Load first vector.  */
+	VMOVU	(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK	%VMM(2), %eax
+	test	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+	test	%rdx, %rdx
+	jz	L(continue_second_vector)
+
+	/* Check whether we can copy a full vector.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(page_cross_small_vec_copy)
+	/* Copy first vector.  */
+	VMOVU	%VMM(1), (%rdi)
+	sub	$CHAR_PER_VEC, %rdx
+
+L(continue_second_vector):
+	/* Align RSI pointer and adjust RDI based on offset.  */
+	mov	%rsi, %rax
+	and	$-VEC_SIZE, %rsi
+	sub	%rsi, %rax
+	sub	%rax, %rdi
+
+	/* Check if the string already copied N chars, i.e. RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(skip_copy_alignment_fix)
+
+	/* Adjust RDX for the copy alignment fix.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	add	%rax, %rdx
+
+L(skip_copy_alignment_fix):
+	/* Load second vector.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x2)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_third_vector)
+
+	/* Jump on below/equal (instead of below) here, because the
+	   last copied character must be NULL.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_second_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy second vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+
+L(continue_third_vector):
+	/* Load third vector.  */
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x3)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_fourth_vector)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_third_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy third vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi)
+
+L(continue_fourth_vector):
+	/* Load fourth vector.  */
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x4)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_4x_align)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_fourth_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy fourth vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 3)(%rdi)
+
+
+L(loop_4x_align):
+	/* Jump to the loop if RSI is already 4-vector aligned.  */
+	test	$(VEC_SIZE * 4 - 1), %esi
+	jz	L(loop_4x_read)
+
+	mov	%rsi, %rcx
+
+	/* Align RSI to 4x vector.  */
+	and	$(VEC_SIZE * -4), %rsi
+	sub	%rsi, %rcx
+
+	/* Adjust RDI for the RSI alignment fix.  */
+	sub	%rcx, %rdi
+
+	/* Jump to the loop if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_4x_read)
+
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+
+	/* Adjust RDX for the RSI alignment fix.  */
+	add	%rcx, %rdx
+	jmp	L(loop_4x_read)
+
+	.p2align 4,,6
+L(loop_4x_vec):
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_partial_copy_return)
+	cmp	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(loop_partial_copy)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VMM(4), (VEC_SIZE * 7)(%rdi)
+	sub	$(CHAR_PER_VEC * 4), %rdx
+
+L(loop_partial_copy_return):
+	sub	$(VEC_SIZE * -4), %rsi
+	sub	$(VEC_SIZE * -4), %rdi
+
+L(loop_4x_read):
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rsi), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%rsi), %VMM(4)
+	VPMINU	%VMM(1), %VMM(2), %VMM(5)
+	VPMINU	%VMM(3), %VMM(4), %VMM(6)
+	VPMINU	%VMM(5), %VMM(6), %VMM(7)
+	VPCMPEQ	%VMM(0), %VMM(7), %VMM(7)
+	vptest	%VMM(7), %VMM(7)
+
+	jz	L(loop_4x_vec)
+
+	/* Check if the string ends in the first or second vector.  */
+	lea	(VEC_SIZE * 4)(%rsi), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	xor	%r10, %r10
+	VPCMPEQ	%VMM(0), %VMM(5), %VMM(6)
+	vptest	%VMM(6), %VMM(6)
+	jnz	L(endloop)
+	sub	$(CHAR_PER_VEC * -2), %rax
+	mov	$(CHAR_PER_VEC * 2), %r10
+	VMOVA	%VMM(3), %VMM(1)
+	VMOVA	%VMM(4), %VMM(2)
+
+L(endloop):
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(2), %VMM(2)
+	PMOVMSK	%VMM(1), %rcx
+	PMOVMSK	%VMM(2), %r9
+	shlq	$32, %r9
+	orq	%r9, %rcx
+	bsf	%rcx, %rcx
+	/* Shift RCX by 2; VPMOVMSK has only a byte version.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+	/* At this point RAX has the length to return.  */
+	add	%rcx, %rax
+	test	%rdx, %rdx
+	jz	L(ret)
+
+	/* Add 1 to account for the NULL character in the RDX
+	   comparison.  */
+	lea	1(%r10, %rcx), %rcx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(loop_partial_copy):
+	cmp	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(loop_partial_first_half)
+	/* Reload the first 2 vectors.  */
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+
+L(loop_partial_first_half):
+	/* Go back 2 vectors from the end and use an overlapping copy.
+	   (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
+	   (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE)  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(7), %VMM(7)
+	jz	L(loop_partial_copy_return)
+	ret
+
+	.p2align 4
+L(page_cross):
+	mov	%rsi, %rcx
+	mov	%rsi, %r11
+	and	$-VEC_SIZE, %r11
+	and	$(VEC_SIZE - 1), %rcx
+	VMOVA	(%r11), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK	%VMM(2), %eax
+	shr	%cl, %eax
+	jz	L(page_cross_continue)
+
+L(ret_vec_x1):
+	bsf	%eax, %eax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %eax
+# endif
+	/* Increment by 1 to account for the NULL char.  */
+	lea	1(%eax), %ecx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+	test	%rdx, %rdx
+	jz	L(ret)
+
+L(page_cross_small_vec_copy):
+	cmp	$(16 / CHAR_SIZE), %rdx
+	jbe	L(copy_8_byte_scalar)
+	VMOVU	(%rsi), %VMM_128(1)
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
+	VMOVU	%VMM_128(1), (%rdi)
+	VMOVU	%VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_8_byte_scalar):
+	cmp	$(8 / CHAR_SIZE), %rdx
+	jbe	L(copy_4_byte_scalar)
+	movq	(%rsi), %r10
+	movq	-8(%rsi, %rdx, CHAR_SIZE), %r11
+	movq	%r10, (%rdi)
+	movq	%r11, -8(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_4_byte_scalar):
+# ifndef USE_AS_WCSLCPY
+	cmp	$4, %rdx
+	jbe	L(copy_2_byte_scalar)
+# endif
+	movl	(%rsi), %r10d
+	movl	-4(%rsi, %rdx, CHAR_SIZE), %r11d
+	movl	%r10d, (%rdi)
+	movl	%r11d, -4(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+# ifndef USE_AS_WCSLCPY
+L(copy_2_byte_scalar):
+	cmp	$2, %rdx
+	jbe	L(copy_1_byte_scalar)
+	movw	(%rsi), %r10w
+	movw	-(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
+	movw	%r10w, (%rdi)
+	movw	%r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_1_byte_scalar):
+	MOVU	(%rsi), %r10b
+	MOVU	%r10b, (%rdi)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+# endif
+
+L(ret_vec_x2):
+	PMOVMSK	%VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	VEC_SIZE(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_second_vector):
+	VMOVU	(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_third_vector)
+
+L(ret):
+	ret
+
+L(ret_vec_x3):
+	PMOVMSK	%VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 2)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_third_vector):
+	VMOVU	(VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_fourth_vector)
+	ret
+
+L(ret_vec_x4):
+	PMOVMSK	%VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 3)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_fourth_vector):
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_fourth_vector)
+	ret
+
+END (STRLCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
new file mode 100644
index 0000000000..eee3b7b086
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
@@ -0,0 +1,25 @@
+/* strlcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __strlcpy __strlcpy_generic
+# include <string/strlcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
new file mode 100644
index 0000000000..ded41fbcfb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy.c
@@ -0,0 +1,36 @@
+/* Multiple versions of strlcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __strlcpy __redirect_strlcpy
+# include <string.h>
+# undef __strlcpy
+
+# define SYMBOL_NAME strlcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
+weak_alias (__strlcpy, strlcpy)
+
+# ifdef SHARED
+__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
new file mode 100644
index 0000000000..dafc20ded0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
@@ -0,0 +1,4 @@
+#define STRLCPY	__wcslcpy_avx2
+#define USE_AS_WCSLCPY 1
+
+#include "strlcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
new file mode 100644
index 0000000000..ffd3c0e846
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
@@ -0,0 +1,25 @@
+/* wcslcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __wcslcpy __wcslcpy_generic
+# include <wcsmbs/wcslcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
new file mode 100644
index 0000000000..371ef9626c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of wcslcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcslcpy __redirect_wcslcpy
+# include <wchar.h>
+# undef __wcslcpy
+
+# define SYMBOL_NAME wcslcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
+weak_alias (__wcslcpy, wcslcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
+# endif
+#endif