Message ID | 20221104201314.401813-2-goldstein.w.n@gmail.com |
---|---|
State | New |
Series | [v3,1/5] benchtests: Make str{n}{cat|cpy} benchmarks output json |
On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote: > Optimizations are: > 1. Use more overlapping stores to avoid branches. > 2. Reduce how unrolled the aligning copies are (this is more of a > code-size save, its a negative for some sizes in terms of > perf). > 3. Improve the loop a bit (similiar to what we do in strlen with > 2x vpminu + kortest instead of 3x vpminu + kmov + test). > 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the > number that are taken. > > Performance Changes: > > Times are from N = 10 runs of the benchmark suite and are > reported as geometric mean of all ratios of > New Implementation / Old Implementation. > > stpcpy-evex -> 0.922 > strcat-evex -> 0.985 > strcpy-evex -> 0.880 > > strncpy-evex -> 0.831 > stpncpy-evex -> 0.780 > > strncat-evex -> 0.958 > > Code Size Changes: > function -> Bytes New / Bytes Old -> Ratio > > strcat-evex -> 819 / 1874 -> 0.437 > strcpy-evex -> 700 / 1074 -> 0.652 > stpcpy-evex -> 735 / 1094 -> 0.672 > > strncpy-evex -> 1397 / 2611 -> 0.535 > stpncpy-evex -> 1489 / 2691 -> 0.553 > > strncat-evex -> 1184 / 2832 -> 0.418 > > Notes: > 1. Because of the significant difference between the > implementations they are split into three files. > > strcpy-evex.S -> strcpy, stpcpy, strcat > strncpy-evex.S -> strncpy > strncat-evex.S > strncat > > I couldn't find a way to merge them without making the > ifdefs incredibly difficult to follow. > > 2. All implementations can be made evex512 by including > "x86-evex512-vecs.h" at the top. > > 3. All implementations have an optional define: > `USE_EVEX_MASKED_STORE` > Setting to one uses evex-masked stores for handling short > strings. This saves code size and branches. It's disabled > for all implementations are the moment as there are some > serious drawbacks to masked stores in certain cases, but > that may be fixed on future architectures. > > Full check passes on x86-64 and build succeeds for all ISA levels w/ > and w/o multiarch. > --- > sysdeps/x86_64/multiarch/stpncpy-evex.S | 5 +- > sysdeps/x86_64/multiarch/strcat-evex.S | 291 +--- > sysdeps/x86_64/multiarch/strcat-strlen-evex.S | 110 ++ > sysdeps/x86_64/multiarch/strcpy-evex.S | 1282 ++++++----------- > sysdeps/x86_64/multiarch/strncat-evex.S | 525 ++++++- > sysdeps/x86_64/multiarch/strncpy-evex.S | 995 ++++++++++++- > .../multiarch/strncpy-or-cat-overflow-def.h | 65 + > 7 files changed, 2100 insertions(+), 1173 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S > create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S > index 99ea76a372..3693491baa 100644 > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S > @@ -3,6 +3,5 @@ > #endif > > #define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY STPNCPY > -#include "strcpy-evex.S" > +#define STRNCPY STPNCPY > +#include "strncpy-evex.S" > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S > index 0e2df947e9..b4207b7889 100644 > --- a/sysdeps/x86_64/multiarch/strcat-evex.S > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S > @@ -1,286 +1,7 @@ > -/* strcat with 256-bit EVEX instructions. > - Copyright (C) 2021-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. 
> - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <isa-level.h> > - > -#if ISA_SHOULD_BUILD (4) > - > - > -# include <sysdep.h> > - > -# ifndef STRCAT > -# define STRCAT __strcat_evex > -# endif > - > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > - > -/* zero register */ > -# define XMMZERO xmm16 > -# define YMMZERO ymm16 > -# define YMM0 ymm17 > -# define YMM1 ymm18 > - > -# define USE_AS_STRCAT > - > -/* Number of bytes in a vector register */ > -# define VEC_SIZE 32 > - > - .section .text.evex,"ax",@progbits > -ENTRY (STRCAT) > - mov %rdi, %r9 > -# ifdef USE_AS_STRNCAT > - mov %rdx, %r8 > -# endif > - > - xor %eax, %eax > - mov %edi, %ecx > - and $((VEC_SIZE * 4) - 1), %ecx > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > - cmp $(VEC_SIZE * 3), %ecx > - ja L(fourth_vector_boundary) > - vpcmpb $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %edx > - test %edx, %edx > - jnz L(exit_null_on_first_vector) > - mov %rdi, %rax > - and $-VEC_SIZE, %rax > - jmp L(align_vec_size_start) > -L(fourth_vector_boundary): > - mov %rdi, %rax > - and $-VEC_SIZE, %rax > - vpcmpb $0, (%rax), %YMMZERO, %k0 > - mov $-1, %r10d > - sub %rax, %rcx > - shl %cl, %r10d > - kmovd %k0, %edx > - and %r10d, %edx > - jnz L(exit) > - > -L(align_vec_size_start): > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > - kmovd %k0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > - kmovd %k2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > - kmovd %k3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > - add $(VEC_SIZE * 4), %rax > - kmovd %k4, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > - kmovd %k2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > - kmovd %k3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > - kmovd %k4, %edx > - add $(VEC_SIZE * 4), %rax > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > - kmovd %k2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > - kmovd %k3, %edx > - test %edx, %edx > - jnz 
L(exit_null_on_fifth_vector) > - > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > - add $(VEC_SIZE * 4), %rax > - kmovd %k4, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > - kmovd %k2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > - kmovd %k3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > - add $(VEC_SIZE * 5), %rax > - kmovd %k4, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > - add $VEC_SIZE, %rax > - kmovd %k0, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > - add $VEC_SIZE, %rax > - kmovd %k0, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 > - add $VEC_SIZE, %rax > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit) > - > - add $VEC_SIZE, %rax > - > - .p2align 4 > -L(align_four_vec_loop): > - VMOVA (%rax), %YMM0 > - VMOVA (VEC_SIZE * 2)(%rax), %YMM1 > - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 > - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 > - vpminub %YMM0, %YMM1, %YMM0 > - /* If K0 != 0, there is a null byte. */ > - vpcmpb $0, %YMM0, %YMMZERO, %k0 > - add $(VEC_SIZE * 4), %rax > - ktestd %k0, %k0 > - jz L(align_four_vec_loop) > - > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 > - sub $(VEC_SIZE * 5), %rax > - kmovd %k0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > - kmovd %k2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > - kmovd %k3, %edx > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 4), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit): > - sub %rdi, %rax > -L(exit_null_on_first_vector): > - bsf %rdx, %rdx > - add %rdx, %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_second_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $VEC_SIZE, %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_third_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 2), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_fourth_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 3), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_fifth_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 4), %rax > - > - .p2align 4 > -L(StartStrcpyPart): > - lea (%r9, %rax), %rdi > - mov %rsi, %rcx > - mov %r9, %rax /* save result */ > - > -# ifdef USE_AS_STRNCAT > - test %r8, %r8 > - jz L(ExitZero) > -# define USE_AS_STRNCPY > -# endif > - > -# include "strcpy-evex.S" > +#ifndef STRCAT > +# define STRCAT __strcat_evex > #endif > + > +#define 
USE_AS_STRCAT > +#define STRCPY STRCAT > +#include "strcpy-evex.S" > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > new file mode 100644 > index 0000000000..9530d7b683 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > @@ -0,0 +1,110 @@ > +/* strlen used for begining of str{n}cat using EVEX 256/512. > + Copyright (C) 2011-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +/* NOTE: This file is meant to be included by strcat-evex or > + strncat-evex and does not standalone. Before including %rdi > + must be saved in %rax. */ Since this file isn't standalone, please rename it to .h. > + > + > +/* Simple strlen implementation that ends at > + L(strcat_strlen_done). */ > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > + movq %rdi, %r8 > + andq $(VEC_SIZE * -1), %r8 > + VPCMPEQ (%r8), %VZERO, %k0 > + KMOV %k0, %VRCX > +#ifdef USE_AS_WCSCPY > + subl %r8d, %edi > + shrl $2, %edi > +#endif > + shrx %VRDI, %VRCX, %VRCX > +#ifdef USE_AS_WCSCPY > + movq %rax, %rdi > +#endif > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v0) > + > + > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 > + KMOV %k0, %VRCX > + leaq (VEC_SIZE)(%r8), %rdi > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v0) > + > + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v1) > + > + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v2) > + > + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v3) > + > + andq $-(VEC_SIZE * 4), %rdi > + .p2align 4,, 8 > +L(loop_2x_vec): > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) > + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) > + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) > + VPTESTN %VMM(1), %VMM(1), %k1 > + VPTESTN %VMM(3), %VMM(3), %k3 > + subq $(VEC_SIZE * -4), %rdi > + KORTEST %k1, %k3 > + jz L(loop_2x_vec) > + > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v0) > + > + KMOV %k1, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v1) > + > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(bsf_and_done_v2) > + > + KMOV %k3, %VRCX > +L(bsf_and_done_v3): > + addq $VEC_SIZE, %rdi > +L(bsf_and_done_v2): > + bsf %VRCX, %VRCX > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi > + jmp L(strcat_strlen_done) > + > + .p2align 4,, 4 > +L(bsf_and_done_v1): > + addq $VEC_SIZE, %rdi > +L(bsf_and_done_v0): > + bsf %VRCX, %VRCX > +#ifdef USE_AS_WCSCPY > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +#else > + addq %rcx, %rdi > +#endif > +L(strcat_strlen_done): > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S 
b/sysdeps/x86_64/multiarch/strcpy-evex.S > index 82e45ac675..1ba0195ed2 100644 > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S > @@ -1,4 +1,4 @@ > -/* strcpy with 256-bit EVEX instructions. > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. > Copyright (C) 2021-2022 Free Software Foundation, Inc. > This file is part of the GNU C Library. > > @@ -17,990 +17,526 @@ > <https://www.gnu.org/licenses/>. */ > > #include <isa-level.h> > - > #if ISA_SHOULD_BUILD (4) > > > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > + /* Use evex-masked stores for small sizes. Turned off at the > + moment. */ > +# define USE_EVEX_MASKED_STORE 0 > + /* Use movsb in page cross case to save code size. */ > +# define USE_MOVSB_IN_PAGE_CROSS 1 > > -# ifndef STRCPY > -# define STRCPY __strcpy_evex > -# endif > +# include <sysdep.h> > > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > # endif > > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > - > -/* Number of bytes in a vector register */ > -# ifndef VEC_SIZE > -# define VEC_SIZE 32 > +# ifndef STRCPY > +# define STRCPY __strcpy_evex > # endif > > -# define XMM2 xmm18 > -# define XMM3 xmm19 > > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > -# define YMM7 ymm23 > +# ifdef USE_AS_WCSCPY > +# define VMOVU_MASK vmovdqu32 > +# define VPMIN vpminud > +# define VPTESTN vptestnmd > +# define VPTEST vptestmd > +# define VPCMPEQ vpcmpeqd > +# define CHAR_SIZE 4 > > -# ifndef USE_AS_STRCAT > +# define REP_MOVS rep movsd > > -/* zero register */ > -# define XMMZERO xmm16 > -# define YMMZERO ymm16 > -# define YMM1 ymm17 > - > - .section .text.evex,"ax",@progbits > -ENTRY (STRCPY) > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > - test %R8_LP, %R8_LP > - jz L(ExitZero) > -# endif > - mov %rsi, %rcx > -# ifndef USE_AS_STPCPY > - mov %rdi, %rax /* save result */ > -# endif > +# define USE_WIDE_CHAR > +# else > +# define VMOVU_MASK vmovdqu8 > +# define VPMIN vpminub > +# define VPTESTN vptestnmb > +# define VPTEST vptestmb > +# define VPCMPEQ vpcmpeqb > +# define CHAR_SIZE 1 > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > +# define REP_MOVS rep movsb > # endif > > - and $((VEC_SIZE * 4) - 1), %ecx > - cmp $(VEC_SIZE * 2), %ecx > - jbe L(SourceStringAlignmentLessTwoVecSize) > - > - and $-VEC_SIZE, %rsi > - and $(VEC_SIZE - 1), %ecx > - > - vpcmpb $0, (%rsi), %YMMZERO, %k0 > - kmovd %k0, %edx > - shr %cl, %rdx > +# include "reg-macros.h" > > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - mov $VEC_SIZE, %r10 > - sub %rcx, %r10 > - cmp %r10, %r8 > -# else > - mov $(VEC_SIZE + 1), %r10 > - sub %rcx, %r10 > - cmp %r10, %r8 > -# endif > - jbe L(CopyVecSizeTailCase2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyVecSizeTail) > - > - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 > - kmovd %k1, %edx > > -# ifdef USE_AS_STRNCPY > - add $VEC_SIZE, %r10 > - cmp %r10, %r8 > - jbe L(CopyTwoVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyTwoVecSize) > - > - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ > - VMOVU %YMM2, (%rdi) > - > -/* If source address alignment != destination address alignment */ > - .p2align 4 > -L(UnalignVecSizeBoth): > - sub %rcx, %rdi > -# ifdef USE_AS_STRNCPY > - add %rcx, %r8 > - sbb %rcx, %rcx > - or %rcx, %r8 > -# endif > - mov $VEC_SIZE, %rcx > - VMOVA (%rsi, %rcx), %YMM2 > - VMOVU %YMM2, (%rdi, %rcx) > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > - vpcmpb $0, %YMM2, %YMMZERO, 
%k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 3), %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > +# ifdef USE_AS_STPCPY > +# define END_REG rax > # else > - jnz L(CopyVecSize) > +# define END_REG rdi, %rdx, CHAR_SIZE > # endif > > - VMOVU %YMM2, (%rdi, %rcx) > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec3) > +# ifdef USE_AS_STRCAT > +# define PAGE_ALIGN_REG edx > +# define PAGE_ALIGN_REG_64 rdx > # else > - jnz L(CopyVecSize) > +# define PAGE_ALIGN_REG eax > +# define PAGE_ALIGN_REG_64 rax > # endif > > - VMOVU %YMM3, (%rdi, %rcx) > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 > - vpcmpb $0, %YMM4, %YMMZERO, %k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec4) > -# else > - jnz L(CopyVecSize) > -# endif > +# define VZERO VMM(7) > +# define VZERO_128 VMM_128(7) > > - VMOVU %YMM4, (%rdi, %rcx) > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > -# else > - jnz L(CopyVecSize) > -# endif > > - VMOVU %YMM2, (%rdi, %rcx) > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > -# else > - jnz L(CopyVecSize) > -# endif > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > - VMOVU %YMM2, (%rdi, %rcx) > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > - kmovd %k0, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec3) > -# else > - jnz L(CopyVecSize) > -# endif > > - VMOVU %YMM3, (%rdi, %rcx) > - mov %rsi, %rdx > - lea VEC_SIZE(%rsi, %rcx), %rsi > - and $-(VEC_SIZE * 4), %rsi > - sub %rsi, %rdx > - sub %rdx, %rdi > -# ifdef USE_AS_STRNCPY > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 > -# endif > -L(UnalignedFourVecSizeLoop): > - VMOVA (%rsi), %YMM4 > - VMOVA VEC_SIZE(%rsi), %YMM5 > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > - vpminub %YMM5, %YMM4, %YMM2 > - vpminub %YMM7, %YMM6, %YMM3 > - vpminub %YMM2, %YMM3, %YMM2 > - /* If K7 != 0, there is a null byte. 
*/ > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > - kmovd %k7, %edx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 4), %r8 > - jbe L(UnalignedLeaveCase2OrCase3) > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRCPY) > +# ifdef USE_AS_STRCAT > + movq %rdi, %rax > +# include "strcat-strlen-evex.S" > # endif > - test %edx, %edx > - jnz L(UnalignedFourVecSizeLeave) > - > -L(UnalignedFourVecSizeLoop_start): > - add $(VEC_SIZE * 4), %rdi > - add $(VEC_SIZE * 4), %rsi > - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) > - VMOVA (%rsi), %YMM4 > - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) > - VMOVA VEC_SIZE(%rsi), %YMM5 > - vpminub %YMM5, %YMM4, %YMM2 > - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > - VMOVU %YMM7, -VEC_SIZE(%rdi) > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > - vpminub %YMM7, %YMM6, %YMM3 > - vpminub %YMM2, %YMM3, %YMM2 > - /* If K7 != 0, there is a null byte. */ > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > - kmovd %k7, %edx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 4), %r8 > - jbe L(UnalignedLeaveCase2OrCase3) > + > + movl %esi, %PAGE_ALIGN_REG > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG > + ja L(page_cross) > +L(page_cross_continue): > + VMOVU (%rsi), %VMM(0) > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + movq %rdi, %rax > # endif > - test %edx, %edx > - jz L(UnalignedFourVecSizeLoop_start) > > -L(UnalignedFourVecSizeLeave): > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > - kmovd %k1, %edx > - test %edx, %edx > - jnz L(CopyVecSizeUnaligned_0) > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > - kmovd %k2, %ecx > - test %ecx, %ecx > - jnz L(CopyVecSizeUnaligned_16) > + /* Two short string implementations. One with traditional > + branching approach and one with masked instructions (which > + have potential for dramatically bad perf if dst splits a > + page and is not in the TLB). 
*/ > +# if USE_EVEX_MASKED_STORE > + VPTEST %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WCSCPY > + subl $((1 << CHAR_PER_VEC)- 1), %VRCX > +# else > + inc %VRCX > +# endif > + jz L(more_1x_vec) > + KMOV %VRCX, %k1 > + KXOR %k0, %k1, %k1 > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > - kmovd %k3, %edx > - test %edx, %edx > - jnz L(CopyVecSizeUnaligned_32) > - > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > - kmovd %k4, %ecx > - bsf %ecx, %edx > - VMOVU %YMM4, (%rdi) > - VMOVU %YMM5, VEC_SIZE(%rdi) > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax > -# endif > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > - add $(VEC_SIZE - 1), %r8 > - sub %rdx, %r8 > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - add $(VEC_SIZE * 3), %rsi > - add $(VEC_SIZE * 3), %rdi > - jmp L(CopyVecSizeExit) > -# endif > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > -/* If source address alignment == destination address alignment */ > +# ifdef USE_AS_STPCPY > + bsf %VRCX, %VRCX > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > +# endif > + ret > > -L(SourceStringAlignmentLessTwoVecSize): > - VMOVU (%rsi), %YMM3 > - VMOVU VEC_SIZE(%rsi), %YMM2 > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > - kmovd %k0, %edx > +# else > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jz L(more_1x_vec) > > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - cmp $VEC_SIZE, %r8 > + xorl %edx, %edx > + bsf %VRCX, %VRDX > +# ifdef USE_AS_STPCPY > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# endif > + > + /* Use mask bits in rcx to detect which copy we need. If the low > + mask is zero then there must be a bit set in the upper half. > + I.e if rcx != 0 and ecx == 0, then match must be upper 32 > + bits so we use L(copy_32_63). */ > +# if VEC_SIZE == 64 > +# ifdef USE_AS_WCSCPY > + testb %cl, %cl > +# else > + testl %ecx, %ecx > +# endif > + jz L(copy_32_63) > +# endif > + > +# ifdef USE_AS_WCSCPY > + testb $0xf, %cl > # else > - cmp $(VEC_SIZE + 1), %r8 > + testw %cx, %cx > # endif > - jbe L(CopyVecSizeTail1Case2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyVecSizeTail1) > + jz L(copy_16_31) > > - VMOVU %YMM3, (%rdi) > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %edx > > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - cmp $(VEC_SIZE * 2), %r8 > +# ifdef USE_AS_WCSCPY > + testb $0x3, %cl > # else > - cmp $((VEC_SIZE * 2) + 1), %r8 > + testb %cl, %cl > # endif > - jbe L(CopyTwoVecSize1Case2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyTwoVecSize1) > - > - and $-VEC_SIZE, %rsi > - and $(VEC_SIZE - 1), %ecx > - jmp L(UnalignVecSizeBoth) > + jz L(copy_8_15) > > -/*------End of main part with loops---------------------*/ > > -/* Case1 */ > +# ifdef USE_AS_WCSCPY > + vmovd %VMM_128(0), (%rdi) > + /* No need to copy, we know its zero. 
*/ > + movl $0, (%END_REG) > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) > - .p2align 4 > -L(CopyVecSize): > - add %rcx, %rdi > -# endif > -L(CopyVecSizeTail): > - add %rcx, %rsi > -L(CopyVecSizeTail1): > - bsf %edx, %edx > -L(CopyVecSizeExit): > - cmp $32, %edx > - jae L(Exit32_63) > - cmp $16, %edx > - jae L(Exit16_31) > - cmp $8, %edx > - jae L(Exit8_15) > - cmp $4, %edx > - jae L(Exit4_7) > - cmp $3, %edx > - je L(Exit3) > - cmp $1, %edx > - ja L(Exit2) > - je L(Exit1) > - movb $0, (%rdi) > -# ifdef USE_AS_STPCPY > - lea (%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $1, %r8 > - lea 1(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > -# endif > ret > +# else > > - .p2align 4 > -L(CopyTwoVecSize1): > - add $VEC_SIZE, %rsi > - add $VEC_SIZE, %rdi > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $VEC_SIZE, %r8 > -# endif > - jmp L(CopyVecSizeTail1) > - > - .p2align 4 > -L(CopyTwoVecSize): > - bsf %edx, %edx > - add %rcx, %rsi > - add $VEC_SIZE, %edx > - sub %ecx, %edx > - jmp L(CopyVecSizeExit) > - > - .p2align 4 > -L(CopyVecSizeUnaligned_0): > - bsf %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > - VMOVU %YMM4, (%rdi) > - add $((VEC_SIZE * 4) - 1), %r8 > - sub %rdx, %r8 > - lea 1(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - jmp L(CopyVecSizeExit) > -# endif > + testb $0x7, %cl > + jz L(copy_4_7) > > - .p2align 4 > -L(CopyVecSizeUnaligned_16): > - bsf %ecx, %edx > - VMOVU %YMM4, (%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea VEC_SIZE(%rdi, %rdx), %rax > -# endif > - VMOVU %YMM5, VEC_SIZE(%rdi) > - add $((VEC_SIZE * 3) - 1), %r8 > - sub %rdx, %r8 > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - add $VEC_SIZE, %rsi > - add $VEC_SIZE, %rdi > - jmp L(CopyVecSizeExit) > -# endif > > - .p2align 4 > -L(CopyVecSizeUnaligned_32): > - bsf %edx, %edx > - VMOVU %YMM4, (%rdi) > - VMOVU %YMM5, VEC_SIZE(%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax > -# endif > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > - add $((VEC_SIZE * 2) - 1), %r8 > - sub %rdx, %r8 > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - add $(VEC_SIZE * 2), %rsi > - add $(VEC_SIZE * 2), %rdi > - jmp L(CopyVecSizeExit) > -# endif > + test %edx, %edx > + jz L(set_null_term) > > -# ifdef USE_AS_STRNCPY > -# ifndef USE_AS_STRCAT > - .p2align 4 > -L(CopyVecSizeUnalignedVec6): > - VMOVU %YMM6, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec5): > - VMOVU %YMM5, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec4): > - VMOVU %YMM4, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec3): > - VMOVU %YMM3, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > + /* NB: make this `vmovw` if support for AVX512-FP16 is added. > + */ > + vmovd %VMM_128(0), %esi > + movw %si, (%rdi) > + > + .p2align 4,, 1 > +L(set_null_term): > + /* No need to copy, we know its zero. 
*/ > + movb $0, (%END_REG) > + ret > # endif > > -/* Case2 */ > - > - .p2align 4 > -L(CopyVecSizeCase2): > - add $VEC_SIZE, %r8 > - add %rcx, %rdi > - add %rcx, %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSizeCase2): > - add %rcx, %rsi > - bsf %edx, %edx > - add $VEC_SIZE, %edx > - sub %ecx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -L(CopyVecSizeTailCase2): > - add %rcx, %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -L(CopyVecSizeTail1Case2): > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -/* Case2 or Case3, Case3 */ > - > - .p2align 4 > -L(CopyVecSizeCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeCase2) > -L(CopyVecSizeCase3): > - add $VEC_SIZE, %r8 > - add %rcx, %rdi > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSizeCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyTwoVecSizeCase2) > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyVecSizeTailCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeTailCase2) > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSize1Case2OrCase3): > - add $VEC_SIZE, %rdi > - add $VEC_SIZE, %rsi > - sub $VEC_SIZE, %r8 > -L(CopyVecSizeTail1Case2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeTail1Case2) > - jmp L(StrncpyExit) > +# if VEC_SIZE == 64 > + .p2align 4,, 6 > +L(copy_32_63): > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > + VMOVU %VMM_256(0), (%rdi) > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > + ret > +# endif > + > + > + .p2align 4,, 6 > +L(copy_16_31): > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > + and will save code size. */ > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > + VMOVU %VMM_128(0), (%rdi) > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > + ret > + > + .p2align 4,, 8 > +L(copy_8_15): > +# ifdef USE_AS_WCSCPY > + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > +# else > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx > +# endif > + vmovq %VMM_128(0), (%rdi) > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) > + ret > # endif > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ > > - .p2align 4 > -L(Exit1): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > -# ifdef USE_AS_STPCPY > - lea 1(%rdi), %rax > +# ifndef USE_AS_WCSCPY > + .p2align 4,, 12 > +L(copy_4_7): > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > + vmovd %VMM_128(0), (%rdi) > + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) > + ret > # endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $2, %r8 > - lea 2(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > + > + > + .p2align 4,, 8 > +L(more_1x_vec): > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > + VMOVU %VMM(0), (%rdi) > # endif > - ret > + subq %rsi, %rdi > + andq $-(VEC_SIZE), %rsi > + addq %rsi, %rdi > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > - .p2align 4 > -L(Exit2): > - movzwl (%rsi), %ecx > - mov %cx, (%rdi) > - movb $0, 2(%rdi) > + /* Ideally we store after moves to minimize impact of potential > + false-dependencies. 
*/ > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + VMOVU %VMM(0), (%rax) > +# endif > + > + VPTESTN %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x1) > + > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > + VMOVU %VMM(1), VEC_SIZE(%rdi) > + > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x2) > + > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > + > + VPTESTN %VMM(3), %VMM(3), %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + VPTESTN %VMM(4), %VMM(4), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x4) > + > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > + > + > + /* Align for 4x loop. */ > + subq %rsi, %rdi > + > + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE > + we covered before aligning. */ > + subq $-(VEC_SIZE * 5), %rsi > + andq $-(VEC_SIZE * 4), %rsi > + > + > + /* Load first half of the loop before entry. */ > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + KORTEST %k2, %k4 > + jnz L(loop_4x_done) > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > + > + subq $(VEC_SIZE * -4), %rsi > + > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + KORTEST %k2, %k4 > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + /* Restore rdi (%rdi). */ > + addq %rsi, %rdi > + test %VRCX, %VRCX > + jnz L(ret_vec_x0_end) > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > + > + KMOV %k2, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x1) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > + > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x2) > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > + /* Place L(ret_vec_x4) here to save code size. We get a > + meaningfuly benefit doing this for stpcpy. 
*/ > + KMOV %k4, %VRDX > +L(ret_vec_x3): > + bsf %VRDX, %VRDX > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > # ifdef USE_AS_STPCPY > - lea 2(%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $3, %r8 > - lea 3(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax > # endif > +L(return_end): > ret > > - .p2align 4 > -L(Exit3): > - mov (%rsi), %edx > - mov %edx, (%rdi) > + .p2align 4,, 6 > +L(ret_vec_x0_end): > + bsf %VRCX, %VRCX > # ifdef USE_AS_STPCPY > - lea 3(%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $4, %r8 > - lea 4(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > # endif > + inc %VRCX > + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > ret > > - .p2align 4 > -L(Exit4_7): > - mov (%rsi), %ecx > - mov %ecx, (%rdi) > - mov -3(%rsi, %rdx), %ecx > - mov %ecx, -3(%rdi, %rdx) > + .p2align 4,, 8 > +L(ret_vec_x1): > + bsf %VRCX, %VRCX > + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > # endif > ret > > - .p2align 4 > -L(Exit8_15): > - mov (%rsi), %rcx > - mov -7(%rsi, %rdx), %r9 > - mov %rcx, (%rdi) > - mov %r9, -7(%rdi, %rdx) > + .p2align 4,, 4 > +L(ret_vec_x2): > + bsf %VRCX, %VRCX > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > # endif > ret > > - .p2align 4 > -L(Exit16_31): > - VMOVU (%rsi), %XMM2 > - VMOVU -15(%rsi, %rdx), %XMM3 > - VMOVU %XMM2, (%rdi) > - VMOVU %XMM3, -15(%rdi, %rdx) > + /* ret_vec_x3 reuses return code after the loop. 
*/ > + .p2align 4,, 6 > +L(ret_vec_x4): > + bsf %VRCX, %VRCX > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > # endif > ret > > - .p2align 4 > -L(Exit32_63): > - VMOVU (%rsi), %YMM2 > - VMOVU -31(%rsi, %rdx), %YMM3 > - VMOVU %YMM2, (%rdi) > - VMOVU %YMM3, -31(%rdi, %rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > + > + .p2align 4,, 4 > +L(page_cross): > +# ifndef USE_AS_STRCAT > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > # endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + movq %rsi, %rcx > + andq $(VEC_SIZE * -1), %rcx > + > + VPCMPEQ (%rcx), %VZERO, %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WCSCPY > + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG > + shrl $2, %PAGE_ALIGN_REG > # endif > - ret > + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX > > -# ifdef USE_AS_STRNCPY > +# if USE_MOVSB_IN_PAGE_CROSS > + /* Optimizing more aggressively for space as this is very cold > + code. This saves 2x cache lines. */ > > - .p2align 4 > -L(StrncpyExit1): > - movzbl (%rsi), %edx > - mov %dl, (%rdi) > -# ifdef USE_AS_STPCPY > - lea 1(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 1(%rdi) > + /* This adds once to the later result which will get correct > + copy bounds. NB: this can never zero-out a non-zero RCX as > + to be in the page cross case rsi cannot be aligned and we > + already right-shift rcx by the misalignment. */ > + shl %VRCX > + jz L(page_cross_continue) > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + movq %rdi, %rax > # endif > - ret > + bsf %VRCX, %VRCX > + REP_MOVS > > - .p2align 4 > -L(StrncpyExit2): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > # ifdef USE_AS_STPCPY > - lea 2(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 2(%rdi) > + leaq -CHAR_SIZE(%rdi), %rax > # endif > ret > > - .p2align 4 > -L(StrncpyExit3_4): > - movzwl (%rsi), %ecx > - movzwl -2(%rsi, %r8), %edx > - mov %cx, (%rdi) > - mov %dx, -2(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - ret > > - .p2align 4 > -L(StrncpyExit5_8): > - mov (%rsi), %ecx > - mov -4(%rsi, %r8), %edx > - mov %ecx, (%rdi) > - mov %edx, -4(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - ret > +# else > + /* Check if we found zero-char before end of page. */ > + test %VRCX, %VRCX > + jz L(page_cross_continue) > > - .p2align 4 > -L(StrncpyExit9_16): > - mov (%rsi), %rcx > - mov -8(%rsi, %r8), %rdx > - mov %rcx, (%rdi) > - mov %rdx, -8(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - ret > + /* Traditional copy case, essentially same as used in non-page- > + cross case but since we can't reuse VMM(0) we need twice as > + many loads from rsi. 
*/ > > - .p2align 4 > -L(StrncpyExit17_32): > - VMOVU (%rsi), %XMM2 > - VMOVU -16(%rsi, %r8), %XMM3 > - VMOVU %XMM2, (%rdi) > - VMOVU %XMM3, -16(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > +# ifndef USE_AS_STRCAT > + xorl %edx, %edx > # endif > - ret > - > - .p2align 4 > -L(StrncpyExit33_64): > - /* 0/32, 31/16 */ > - VMOVU (%rsi), %YMM2 > - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 > - VMOVU %YMM2, (%rdi) > - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) > + /* Dependency on rdi must already have been satisfied. */ > + bsf %VRCX, %VRDX > # ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# elif !defined USE_AS_STRCAT > + movq %rdi, %rax > # endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - ret > > - .p2align 4 > -L(StrncpyExit65): > - /* 0/32, 32/32, 64/1 */ > - VMOVU (%rsi), %YMM2 > - VMOVU 32(%rsi), %YMM3 > - mov 64(%rsi), %cl > - VMOVU %YMM2, (%rdi) > - VMOVU %YMM3, 32(%rdi) > - mov %cl, 64(%rdi) > -# ifdef USE_AS_STPCPY > - lea 65(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 65(%rdi) > +# if VEC_SIZE == 64 > +# ifdef USE_AS_WCSCPY > + testb %cl, %cl > +# else > + test %ecx, %ecx > +# endif > + jz L(page_cross_copy_32_63) > # endif > - ret > - > -# ifndef USE_AS_STRCAT > > - .p2align 4 > -L(Fill1): > - mov %dl, (%rdi) > - ret > +# ifdef USE_AS_WCSCPY > + testb $0xf, %cl > +# else > + testw %cx, %cx > +# endif > + jz L(page_cross_copy_16_31) > > - .p2align 4 > -L(Fill2): > - mov %dx, (%rdi) > - ret > +# ifdef USE_AS_WCSCPY > + testb $0x3, %cl > +# else > + testb %cl, %cl > +# endif > + jz L(page_cross_copy_8_15) > > - .p2align 4 > -L(Fill3_4): > - mov %dx, (%rdi) > - mov %dx, -2(%rdi, %r8) > +# ifdef USE_AS_WCSCPY > + movl (%rsi), %esi > + movl %esi, (%rdi) > + movl $0, (%END_REG) > ret > +# else > > - .p2align 4 > -L(Fill5_8): > - mov %edx, (%rdi) > - mov %edx, -4(%rdi, %r8) > - ret > + testb $0x7, %cl > + jz L(page_cross_copy_4_7) > > - .p2align 4 > -L(Fill9_16): > - mov %rdx, (%rdi) > - mov %rdx, -8(%rdi, %r8) > + test %edx, %edx > + jz L(page_cross_set_null_term) > + movzwl (%rsi), %ecx > + movw %cx, (%rdi) > +L(page_cross_set_null_term): > + movb $0, (%END_REG) > ret > > - .p2align 4 > -L(Fill17_32): > - VMOVU %XMMZERO, (%rdi) > - VMOVU %XMMZERO, -16(%rdi, %r8) > - ret > > - .p2align 4 > -L(CopyVecSizeUnalignedVec2): > - VMOVU %YMM2, (%rdi, %rcx) > - > - .p2align 4 > -L(CopyVecSizeVecExit): > - bsf %edx, %edx > - add $(VEC_SIZE - 1), %r8 > - add %rcx, %rdi > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > - sub %rdx, %r8 > - lea 1(%rdi, %rdx), %rdi > - > - .p2align 4 > -L(StrncpyFillTailWithZero): > - xor %edx, %edx > - sub $VEC_SIZE, %r8 > - jbe L(StrncpyFillExit) > - > - VMOVU %YMMZERO, (%rdi) > - add $VEC_SIZE, %rdi > - > - mov %rdi, %rsi > - and $(VEC_SIZE - 1), %esi > - sub %rsi, %rdi > - add %rsi, %r8 > - sub $(VEC_SIZE * 4), %r8 > - jb L(StrncpyFillLessFourVecSize) > - > -L(StrncpyFillLoopVmovdqa): > - VMOVA %YMMZERO, (%rdi) > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) > - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) > - add $(VEC_SIZE * 4), %rdi > - sub $(VEC_SIZE * 4), %r8 > - jae L(StrncpyFillLoopVmovdqa) > - > -L(StrncpyFillLessFourVecSize): > - add $(VEC_SIZE * 2), %r8 > - jl L(StrncpyFillLessTwoVecSize) > - VMOVA %YMMZERO, (%rdi) > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > - add $(VEC_SIZE * 2), %rdi > - sub $VEC_SIZE, %r8 > - jl L(StrncpyFillExit) > - VMOVA %YMMZERO, (%rdi) > - add $VEC_SIZE, %rdi > - 
jmp L(Fill) > - > - .p2align 4 > -L(StrncpyFillLessTwoVecSize): > - add $VEC_SIZE, %r8 > - jl L(StrncpyFillExit) > - VMOVA %YMMZERO, (%rdi) > - add $VEC_SIZE, %rdi > - jmp L(Fill) > - > - .p2align 4 > -L(StrncpyFillExit): > - add $VEC_SIZE, %r8 > -L(Fill): > - cmp $17, %r8d > - jae L(Fill17_32) > - cmp $9, %r8d > - jae L(Fill9_16) > - cmp $5, %r8d > - jae L(Fill5_8) > - cmp $3, %r8d > - jae L(Fill3_4) > - cmp $1, %r8d > - ja L(Fill2) > - je L(Fill1) > + .p2align 4,, 4 > +L(page_cross_copy_4_7): > + movl (%rsi), %ecx > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi > + movl %ecx, (%rdi) > + movl %esi, -(4 - CHAR_SIZE)(%END_REG) > ret > - > -/* end of ifndef USE_AS_STRCAT */ > # endif > > - .p2align 4 > -L(UnalignedLeaveCase2OrCase3): > - test %rdx, %rdx > - jnz L(UnalignedFourVecSizeLeaveCase2) > -L(UnalignedFourVecSizeLeaveCase3): > - lea (VEC_SIZE * 4)(%r8), %rcx > - and $-VEC_SIZE, %rcx > - add $(VEC_SIZE * 3), %r8 > - jl L(CopyVecSizeCase3) > - VMOVU %YMM4, (%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - VMOVU %YMM5, VEC_SIZE(%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > -# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 4)(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (VEC_SIZE * 4)(%rdi) > -# endif > +# if VEC_SIZE == 64 > + .p2align 4,, 4 > +L(page_cross_copy_32_63): > + VMOVU (%rsi), %VMM_256(0) > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > + VMOVU %VMM_256(0), (%rdi) > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > ret > - > - .p2align 4 > -L(UnalignedFourVecSizeLeaveCase2): > - xor %ecx, %ecx > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > - kmovd %k1, %edx > - add $(VEC_SIZE * 3), %r8 > - jle L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec4) > -# else > - jnz L(CopyVecSize) > -# endif > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > - kmovd %k2, %edx > - VMOVU %YMM4, (%rdi) > - add $VEC_SIZE, %rcx > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec5) > -# else > - jnz L(CopyVecSize) > # endif > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > - kmovd %k3, %edx > - VMOVU %YMM5, VEC_SIZE(%rdi) > - add $VEC_SIZE, %rcx > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec6) > -# else > - jnz L(CopyVecSize) > -# endif > - > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > - kmovd %k4, %edx > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > - lea VEC_SIZE(%rdi, %rcx), %rdi > - lea VEC_SIZE(%rsi, %rcx), %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > -L(StrncpyExit): > - cmp $65, %r8d > - je L(StrncpyExit65) > - cmp $33, %r8d > - jae L(StrncpyExit33_64) > - cmp $17, %r8d > - jae L(StrncpyExit17_32) > - cmp $9, %r8d > - jae L(StrncpyExit9_16) > - cmp $5, %r8d > - jae L(StrncpyExit5_8) > - cmp $3, %r8d > - jae L(StrncpyExit3_4) > - cmp $1, %r8d > - ja L(StrncpyExit2) > - je L(StrncpyExit1) > -# ifdef USE_AS_STPCPY > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi) > -# endif > + .p2align 4,, 4 > +L(page_cross_copy_16_31): > + vmovdqu (%rsi), %xmm0 > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > + vmovdqu %xmm0, (%rdi) > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > ret > > - .p2align 4 > -L(ExitZero): > -# ifndef USE_AS_STRCAT > - mov %rdi, %rax > -# endif > + .p2align 4,, 
4 > +L(page_cross_copy_8_15): > + movq (%rsi), %rcx > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > + movq %rcx, (%rdi) > + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) > ret > - > -# endif > - > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# else > -END (STRCAT) > # endif > +END(STRCPY) > #endif > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S > index 203a19bf21..d648ba5cfe 100644 > --- a/sysdeps/x86_64/multiarch/strncat-evex.S > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S > @@ -1,7 +1,520 @@ > -#ifndef STRNCAT > -# define STRNCAT __strncat_evex > -#endif > +/* {wcs|str}ncat with 256/512-bit EVEX. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > + /* Use evex-masked stores for small sizes. Turned off at the > + moment. */ > +# define USE_EVEX_MASKED_STORE 0 > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > +# ifndef STRNCAT > +# define STRNCAT __strncat_evex > +# endif > + > + > +# ifdef USE_AS_WCSCPY > +# define movNULL movl > +# define VMOVU_MASK vmovdqu32 > +# define VPMIN vpminud > +# define VPTESTN vptestnmd > +# define VPTEST vptestmd > +# define VPCMPEQ vpcmpeqd > +# define CHAR_SIZE 4 > + > +# define REP_MOVS rep movsd > + > +# define VMASK_REG VR10 > +# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst > + > +# define USE_WIDE_CHAR > +# else > +# define movNULL movb > +# define VMOVU_MASK vmovdqu8 > +# define VPMIN vpminub > +# define VPTESTN vptestnmb > +# define VPTEST vptestmb > +# define VPCMPEQ vpcmpeqb > +# define CHAR_SIZE 1 > + > +# define REP_MOVS rep movsb > + > +# define VMASK_REG VRCX > +# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst > + > +# endif > + > +# include "strncpy-or-cat-overflow-def.h" > + > +# include "reg-macros.h" > + > + > +# define VZERO VMM(7) > +# define VZERO_128 VMM_128(7) > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRNCAT) > + movq %rdi, %rax > + > + /* NB: It's safe to filter out zero-length strings WITHOUT > + setting null-term. Destination MUST be a null-terminated > + string so essentially the work is already done. 
*/ > +# ifdef USE_AS_WCSCPY > + leaq -1(%rdx), %rcx > + shrq $56, %rcx > + jnz L(zero_len) > +# else > + test %rdx, %rdx > + jle L(zero_len) > +# endif > + > +# include "strcat-strlen-evex.S" > + > + movl %esi, %ecx > + andl $(PAGE_SIZE - 1), %ecx > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > + ja L(page_cross) > +L(page_cross_continue): > + VMOVU (%rsi), %VMM(0) > + VPTESTN %VMM(0), %VMM(0), %k0 > + > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > + <= CHAR_PER_VEC with masked instructions (which have > + potential for dramatically bad perf if dst splits a page and > + is not in the TLB). */ > +# if USE_EVEX_MASKED_STORE > + KMOV %k0, %VRCX > + FIND_FIRST_ONE (VRCX, VR8) > + cmpq %r8, %rdx > + jbe L(less_1x_vec) > + > + test %VRCX, %VRCX > + jz L(more_1x_vec) > + > + blsmsk %VRCX, %VRCX > + KMOV %VRCX, %k1 > + VMOVU_MASK %VMM(0), (%rdi){%k1} > + ret > + > +L(less_1x_vec): > + mov $-1, %VRCX > + bzhi %VRDX, %VRCX, %VRCX > + KMOV %VRCX, %k1 > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + VMOVU_MASK %VMM(0), (%rdi){%k1} > + > + ret > +# else > + KMOV %k0, %VMASK_REG > + /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf > + %VMASK_REG, %VRCX` for wcsncat. */ > + FIND_FIRST_ONE (VMASK_REG, VRCX) > + cmpq %rcx, %rdx > + jbe L(less_1x_vec) > + > + /* If there were no zero-CHARs (rcx was zero before > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ > + cmpl $CHAR_PER_VEC, %ecx > + je L(more_1x_vec) > + > + movl %ecx, %edx > + > +L(less_1x_vec): > +# if VEC_SIZE == 64 > + cmpl $(32 / CHAR_SIZE), %edx > + jae L(copy_32_63) > +# endif > + > + cmpl $(16 / CHAR_SIZE), %edx > + jae L(copy_16_31) > + > + > + cmpl $(8 / CHAR_SIZE), %edx > + jae L(copy_8_15) > + > +# ifdef USE_AS_WCSCPY > + vmovd %VMM_128(0), (%rdi) > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + ret > +# else > + > + cmpl $4, %edx > + jae L(copy_4_7) > + > + movzbl (%rsi), %ecx > + cmpl $1, %edx > + jbe L(set_null_term) > + > + movzwl 1(%rsi), %esi > + movw %si, 1(%rdi) > + > + .p2align 4,, 1 > +L(set_null_term): > + movb %cl, (%rdi) > + movNULL $0, (%rdi, %rdx) > + ret > +# endif > + > +# if VEC_SIZE == 64 > + .p2align 4,, 6 > +L(copy_32_63): > + VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > + VMOVU %VMM_256(0), (%rdi) > + VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE) > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + ret > +# endif > + .p2align 4,, 6 > +L(copy_16_31): > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > + and will save code size. */ > + vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1 > + VMOVU %VMM_128(0), (%rdi) > + vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE) > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + ret > + > + .p2align 4,, 2 > +L(copy_8_15): > + movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx > + vmovq %VMM_128(0), (%rdi) > + movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE) > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + ret > + > +# ifndef USE_AS_WCSCPY > + .p2align 4,, 12 > +L(copy_4_7): > + movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx > + vmovd %VMM_128(0), (%rdi) > + movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE) > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > + ret > +# endif > + > +# endif > + .p2align 4,, 4 > +L(zero_len): > +# ifdef USE_AS_WCSCPY > + test %rdx, %rdx > +# endif > + jne OVERFLOW_STRCAT > + ret > > -#define USE_AS_STRNCAT > -#define STRCAT STRNCAT > -#include "strcat-evex.S" > + .p2align 4,, 8 > +L(more_1x_vec): > + VMOVU %VMM(0), (%rdi) > + > + /* We are going to align rsi here so will need to be able to re- > + adjust rdi/rdx afterwords. 
NB: We filtered out huge lengths > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > + > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > + subq %rsi, %rdi > + andq $-(VEC_SIZE), %rsi > +L(loop_last_4x_vec): > + addq %rsi, %rdi > + subq %rsi, %rdx > +# ifdef USE_AS_WCSCPY > + shrq $2, %rdx > +# endif > + > + /* Will need this regardless. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VMASK_REG > + > + cmpq $(CHAR_PER_VEC * 2), %rdx > + ja L(more_2x_vec) > + > +L(last_2x_vec): > + FIND_FIRST_ONE (VMASK_REG, VRCX) > + cmpl %ecx, %edx > + jbe L(ret_vec_x1_len) > + > + /* If there were no zero-CHARs (rcx was zero before > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ > + cmpl $CHAR_PER_VEC, %ecx > + jne L(ret_vec_x1) > + > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + addl $-CHAR_PER_VEC, %edx > + bzhi %VRDX, %VRCX, %VR8 > + jz L(ret_vec_x2_len) > +L(ret_vec_x2): > + bsf %VRCX, %VRDX > +L(ret_vec_x2_len): > + VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > + ret > + > + .p2align 4,, 4 > +L(ret_vec_x1_len): > + movl %edx, %ecx > +L(ret_vec_x1): > + VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE) > + VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > + VZEROUPPER_RETURN > + > + > + .p2align 4,, 8 > +L(last_4x_vec): > + addl $-(CHAR_PER_VEC * 4), %edx > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VMASK_REG > + subq $-(VEC_SIZE * 4), %rsi > + subq $-(VEC_SIZE * 4), %rdi > + cmpl $(CHAR_PER_VEC * 2), %edx > + jbe L(last_2x_vec) > + .p2align 4,, 8 > +L(more_2x_vec): > +# ifdef USE_AS_WCSCPY > + xorl %ecx, %ecx > +# endif > + bsf %VMASK_REG, %VRCX > + jnz L(ret_vec_x1) > + > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x2) > + > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > + VPTESTN %VMM(3), %VMM(3), %k0 > + KMOV %k0, %VMASK_REG > + > + cmpq $(CHAR_PER_VEC * 4), %rdx > + ja L(more_4x_vec) > + > + /* Adjust length before going to L(ret_vec_x3_len) or > + L(ret_vec_x3). */ > + addl $(CHAR_PER_VEC * -2), %edx > + > + FIND_FIRST_ONE (VMASK_REG, VRCX) > + cmpl %ecx, %edx > + jbe L(ret_vec_x3_len) > + > + /* If there were no zero-CHARs (rcx was zero before > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > + cmpl $CHAR_PER_VEC, %ecx > + jne L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + VPTESTN %VMM(4), %VMM(4), %k0 > + KMOV %k0, %VRCX > + addl $-CHAR_PER_VEC, %edx > + bzhi %VRDX, %VRCX, %VR8 > + jz L(ret_vec_x4_len) > +L(ret_vec_x4): > + bsf %VRCX, %VRDX > +L(ret_vec_x4_len): > + VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > + ret > + > + .p2align 4,, 4 > +L(ret_vec_x3_len): > + movl %edx, %ecx > +L(ret_vec_x3): > + VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > + VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > + ret > + > + .p2align 4,, 8 > +L(more_4x_vec): > +# ifdef USE_AS_WCSCPY > + xorl %ecx, %ecx > +# endif > + bsf %VMASK_REG, %VRCX > + jnz L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + VPTESTN %VMM(4), %VMM(4), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x4) > + > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > + > + /* Check if we are near the end before aligning. */ > + cmpq $(CHAR_PER_VEC * 8), %rdx > + jbe L(last_4x_vec) > + > + > + /* Add rsi to rdx (length) before aligning rsi. NB: Since we > + filtered out huge lengths this cannot overflow. */ > +# ifdef USE_AS_WCSCPY > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > +# else > + addq %rsi, %rdx > +# endif > + > + /* Subtract rsi from rdi before aligning (add back will have > + correct rdi for aligned rsi). */ > + subq %rsi, %rdi > + subq $-(VEC_SIZE * 5), %rsi > + andq $(VEC_SIZE * -4), %rsi > + > + /* Load first half of the loop before entry. */ > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + > + /* Offset rsi by VEC_SIZE so that we can jump to > + L(loop_last_4x_vec). */ > + addq $-(VEC_SIZE), %rsi > + KORTEST %k2, %k4 > + jnz L(loop_4x_done) > + > + /* Store loop end in r9. */ > + leaq -(VEC_SIZE * 5)(%rdx), %r9 > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > + > + subq $(VEC_SIZE * -4), %rsi > + cmpq %rsi, %r9 > + jbe L(loop_last_4x_vec) > + > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + KORTEST %k2, %k4 > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + /* Restore rdi (dst). */ > + addq %rsi, %rdi > + > + /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so > + test with bsf. 
*/ > + bsf %VRCX, %VRCX > + jnz L(ret_vec_x1) > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi) > + > + KMOV %k2, %VRCX > + test %VRCX, %VRCX > + jnz L(ret_vec_x2) > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > + > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + bsf %VRCX, %VRCX > + jnz L(ret_vec_x3) > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > + > + KMOV %k4, %VRCX > + bsf %VRCX, %VRCX > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > + ret > + > + > + .p2align 4,, 4 > +L(page_cross): > + movq %rsi, %r8 > + andq $(VEC_SIZE * -1), %r8 > + VPCMPEQ (%r8), %VZERO, %k0 > + > +# ifdef USE_AS_WCSCPY > + KMOV %k0, %VR9 > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > + shrx %VRCX, %VR9, %VRCX > +# else > + KMOV %k0, %VRCX > + shrx %VRSI, %VRCX, %VRCX > +# endif > + > + subl %esi, %r8d > + andl $(VEC_SIZE - 1), %r8d > +# ifdef USE_AS_WCSCPY > + shrl $2, %r8d > +# endif > + cmpq %r8, %rdx > + jbe L(page_cross_small) > + /* Optimizing more for space as this is very cold code. This > + saves 2x cache lines. */ > + > + /* This adds once to the later result which will get correct > + copy bounds. NB: this can never zero-out a non-zero RCX as > + to be in the page cross case rsi cannot be aligned and we > + already right-shift rcx by the misalignment. */ > + shl %VRCX > + jz L(page_cross_continue) > + bsf %VRCX, %VRCX > + REP_MOVS > + ret > + > +L(page_cross_small): > + tzcnt %VRCX, %VRCX > + jz L(page_cross_setz) > + cmpl %edx, %ecx > + cmova %edx, %ecx > + > +# ifdef USE_AS_WCSCPY > + rep movsd > +# else > + rep movsb > +# endif > +L(page_cross_setz): > + movNULL $0, (%rdi) > + ret > +END(STRNCAT) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S > index 1b3426d511..49eaf4cbd9 100644 > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S > @@ -1,7 +1,990 @@ > -#ifndef STRNCPY > -# define STRNCPY __strncpy_evex > -#endif > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (4) > + > + /* Use evex-masked stores for small sizes. Turned off at the > + moment. 
*/ > +# define USE_EVEX_MASKED_STORE 0 > + > + > +# include <sysdep.h> > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > + > +# ifndef STRNCPY > +# define STRNCPY __strncpy_evex > +# endif > + > +# ifdef USE_AS_WCSCPY > +# define VMOVU_MASK vmovdqu32 > +# define VPCMPEQ vpcmpeqd > +# define VPMIN vpminud > +# define VPTESTN vptestnmd > +# define VPTEST vptestmd > +# define CHAR_SIZE 4 > + > +# define REP_MOVS rep movsd > +# define REP_STOS rep stosl > + > +# define USE_WIDE_CHAR > + > +# else > +# define VMOVU_MASK vmovdqu8 > +# define VPCMPEQ vpcmpeqb > +# define VPMIN vpminub > +# define VPTESTN vptestnmb > +# define VPTEST vptestmb > +# define CHAR_SIZE 1 > + > +# define REP_MOVS rep movsb > +# define REP_STOS rep stosb > +# endif > + > +# include "strncpy-or-cat-overflow-def.h" > + > +# define PAGE_SIZE 4096 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +# include "reg-macros.h" > + > + > +# define VZERO VMM(7) > +# define VZERO_256 VMM_256(7) > +# define VZERO_128 VMM_128(7) > + > +# if VEC_SIZE == 64 > +# define VZERO_HALF VZERO_256 > +# else > +# define VZERO_HALF VZERO_128 > +# endif > + > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRNCPY) > + /* Filter zero length strings and very long strings. Zero > + length strings just return, very long strings are handled by > + just running rep stos{b|l} to zero set (which will almost > + certainly segfault), if that succeeds then just calling > + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ > +# ifdef USE_AS_WCSCPY > + decq %rdx > + movq %rdx, %rax > + /* 56 is end of max supported address space. */ > + shr $56, %rax > + jnz L(zero_len) > +# else > + decq %rdx > + /* If the flag needs to become `jb` replace `dec` with `sub`. > + */ > + jl L(zero_len) > +# endif > + > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > + movl %esi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + VMOVU (%rsi), %VMM(0) > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + > + /* If no STPCPY just save end ahead of time. */ > +# ifndef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + > + > + cmpq $(CHAR_PER_VEC), %rdx > + > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > + <= CHAR_PER_VEC with masked instructions (which have > + potential for dramatically bad perf if dst splits a page and > + is not in the TLB). */ > +# if USE_EVEX_MASKED_STORE > + /* `jae` because length rdx is now length - 1. */ > + jae L(more_1x_vec) > + > + /* If there where multiple zero-CHAR matches in the first VEC, > + VRCX will be overset but thats fine since any oversets where > + at zero-positions anyways. */ > + > +# ifdef USE_AS_STPCPY > + tzcnt %VRCX, %VRAX > + cmpl %eax, %edx > + cmovb %edx, %eax > +# ifdef USE_AS_WCSCPY > + adcl $0, %eax > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + adcq %rdi, %rax > +# endif > +# endif > + dec %VRCX > + > + /* Zero out all non-zero CHAR's after the first zero match. */ > + KMOV %VRCX, %k1 > + > + /* Use VZERO as destination so this can be reused for > + L(zfill_less_vec) (which if jumped to by subsequent logic > + will have zerod out VZERO. */ > + VMOVU_MASK %VMM(0), %VZERO{%k1}{z} > +L(zfill_less_vec): > + /* Get mask for what we need to set. 
*/ > + incl %edx > + mov $-1, %VRCX > + bzhi %VRDX, %VRCX, %VRCX > + KMOV %VRCX, %k1 > + VMOVU_MASK %VZERO, (%rdi){%k1} > + ret > + > + .p2align 4,, 4 > +L(zero_len): > + cmpq $-1, %rdx > + jne L(best_effort_strncpy) > + movq %rdi, %rax > + ret > + > + .p2align 4,, 8 > +L(more_1x_vec): > +# else > + /* `jb` because length rdx is now length - 1. */ > + jb L(less_1x_vec) > +# endif > + > + > + /* This may overset but thats fine because we still need to zero > + fill. */ > + VMOVU %VMM(0), (%rdi) > + > + > + /* Length must be >= CHAR_PER_VEC so match here means we must > + zero-fill. */ > + test %VRCX, %VRCX > + jnz L(zfill) > + > + > + /* We are going to align rsi here so will need to be able to re- > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > + subq %rsi, %rdi > + andq $-(VEC_SIZE), %rsi > + > +L(loop_last_4x_vec): > + addq %rsi, %rdi > + subq %rsi, %rdx > +# ifdef USE_AS_WCSCPY > + shrq $2, %rdx > +# endif > + > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VRCX > + > + /* -1 because of the `dec %rdx` earlier. */ > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > + ja L(more_2x_vec) > + > +L(last_2x_vec): > + /* This will be need to be computed no matter what. We do it > + ahead of time for CHAR_PER_VEC == 64 because we can't adjust > + the value of `tzcnt` with a shift. */ > +# if CHAR_PER_VEC == 64 > + tzcntq %rcx, %rcx > +# endif > + > + cmpl $(CHAR_PER_VEC), %edx > + jb L(ret_vec_x1_len) > + > + /* Seperate logic for CHAR_PER_VEC == 64 because we already did > + `tzcnt` on VRCX. */ > +# if CHAR_PER_VEC == 64 > + /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */ > + cmpb $CHAR_PER_VEC, %cl > + jnz L(ret_vec_x1_no_bsf) > +# else > + test %VRCX, %VRCX > + jnz L(ret_vec_x1) > +# endif > + > + > + > + VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + KMOV %k0, %VRCX > + > +# if CHAR_PER_VEC < 64 > + /* This essentiallys adds CHAR_PER_VEC to computed result. */ > + shlq $CHAR_PER_VEC, %rcx > +# else > + tzcntq %rcx, %rcx > + addl $CHAR_PER_VEC, %ecx > +# endif > + > + .p2align 4,, 4 > +L(ret_vec_x1_len): > + /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has > + already been done. */ > +# if CHAR_PER_VEC < 64 > + tzcntq %rcx, %rcx > +# endif > + cmpl %ecx, %edx > + jbe L(ret_vec_x1_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > +L(ret_vec_x1_len_no_zfill_mov): > + movl %ecx, %edx > +# ifdef USE_AS_STPCPY > + /* clear flags. */ > + xorl %ecx, %ecx > +# endif > +L(ret_vec_x1_len_no_zfill): > + VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > +# else > + leal (VEC_SIZE)(%rdx), %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > + > + > + .p2align 4,, 10 > +L(ret_vec_x1): > + bsf %VRCX, %VRCX > +L(ret_vec_x1_no_bsf): > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > + subl %ecx, %edx > + cmpl $CHAR_PER_VEC, %edx > + jb L(ret_vec_x1_len_no_zfill_mov) > + /* Fall through (expectation) is copy len < buffer len. 
*/ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > + leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax > +# endif > + ret > + > + .p2align 4,, 8 > +L(last_4x_vec): > + /* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl > + $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just > + using `movzbl`. */ > +# if CHAR_PER_VEC == 64 > + movzbl %dl, %edx > +# else > + andl $(CHAR_PER_VEC * 4 - 1), %edx > +# endif > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > + VPTESTN %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VRCX > + subq $-(VEC_SIZE * 4), %rsi > + subq $-(VEC_SIZE * 4), %rdi > + cmpl $(CHAR_PER_VEC * 2 - 1), %edx > + jbe L(last_2x_vec) > + .p2align 4,, 8 > +L(more_2x_vec): > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + test %VRCX, %VRCX > + /* Must fill at least 2x VEC. */ > + jnz L(zfill_vec1) > + > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + /* Must fill at least 1x VEC. */ > + jnz L(zfill_vec2) > + > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > + VPTESTN %VMM(3), %VMM(3), %k0 > + KMOV %k0, %VRCX > + > + /* Check if len is more 4x VEC. -1 because rdx is len - 1. */ > + cmpq $(CHAR_PER_VEC * 4 - 1), %rdx > + ja L(more_4x_vec) > + > + subl $(CHAR_PER_VEC * 3), %edx > + jb L(ret_vec_x3_len) > + > + test %VRCX, %VRCX > + jnz L(ret_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + KMOV %k0, %VRCX > + tzcnt %VRCX, %VRCX > + cmpl %ecx, %edx > + jbe L(ret_vec_x4_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > + movl %ecx, %edx > +L(ret_vec_x4_len_no_zfill): > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax > +# else > + leal (VEC_SIZE * 4 + 0)(%rdx), %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > + > + > +L(ret_vec_x3_len): > + addl $(CHAR_PER_VEC * 1), %edx > + tzcnt %VRCX, %VRCX > + cmpl %ecx, %edx > + jbe L(ret_vec_x3_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > +L(ret_vec_x3_len_no_zfill_mov): > + movl %ecx, %edx > +# ifdef USE_AS_STPCPY > + /* clear flags. 
*/ > + xorl %ecx, %ecx > +# endif > + .p2align 4,, 4 > +L(ret_vec_x3_len_no_zfill): > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > + VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax > +# else > + leal (VEC_SIZE * 3 + 0)(%rdx), %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > + > + > + .p2align 4,, 8 > +L(ret_vec_x3): > + bsf %VRCX, %VRCX > + VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) > + subl %ecx, %edx > + jl L(ret_vec_x3_len_no_zfill_mov) > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > +# endif > + ret > + > + .p2align 4,, 8 > +L(more_4x_vec): > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > + test %VRCX, %VRCX > + jnz L(zfill_vec3) > + > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > + VPTESTN %VMM(4), %VMM(4), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(zfill_vec4) > > -#define USE_AS_STRNCPY > -#define STRCPY STRNCPY > -#include "strcpy-evex.S" > + /* Recheck length before aligning. */ > + cmpq $(CHAR_PER_VEC * 8 - 1), %rdx > + jbe L(last_4x_vec) > + > + /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */ > +# ifdef USE_AS_WCSCPY > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > +# else > + addq %rsi, %rdx > +# endif > + subq %rsi, %rdi > + subq $-(VEC_SIZE * 5), %rsi > + andq $(VEC_SIZE * -4), %rsi > + > + > + /* Load first half of the loop before entry. */ > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + > + > + /* Offset rsi by VEC_SIZE so that we can jump to > + L(loop_last_4x_vec). */ > + addq $-(VEC_SIZE), %rsi > + KORTEST %k2, %k4 > + jnz L(loop_4x_done) > + > + /* Store loop end in r9. */ > + leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > + > + subq $(VEC_SIZE * -4), %rsi > + cmpq %rsi, %r9 > + jbe L(loop_last_4x_vec) > + > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPTESTN %VMM(4), %VMM(4), %k2 > + VPTESTN %VMM(6), %VMM(6), %k4 > + KORTEST %k2, %k4 > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + /* Restore rdx (length). */ > + subq %rsi, %rdx > +# ifdef USE_AS_WCSCPY > + shrq $2, %rdx > +# endif > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > + /* Restore rdi (dst). 
*/ > + addq %rsi, %rdi > + VPTESTN %VMM(0), %VMM(0), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(zfill_vec1) > + > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > + KMOV %k2, %VRCX > + test %VRCX, %VRCX > + jnz L(zfill_vec2) > + > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > + VPTESTN %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(zfill_vec3) > + > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) > + KMOV %k4, %VRCX > + // Zfill more.... > + > + .p2align 4,, 4 > +L(zfill_vec4): > + subq $(VEC_SIZE * -2), %rdi > + addq $(CHAR_PER_VEC * -2), %rdx > +L(zfill_vec2): > + subq $(VEC_SIZE * -2), %rdi > + addq $(CHAR_PER_VEC * -1), %rdx > +L(zfill): > + /* VRCX must be non-zero. */ > + bsf %VRCX, %VRCX > + > + /* Adjust length / dst for zfill. */ > + subq %rcx, %rdx > +# ifdef USE_AS_WCSCPY > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_from_page_cross): > + > + /* From here on out its just memset(rdi, 0, rdx). */ > + cmpq $CHAR_PER_VEC, %rdx > + jb L(zfill_less_vec) > + > +L(zfill_more_1x_vec): > + VMOVU %VZERO, (%rdi) > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > + ja L(zfill_more_2x_vec) > +L(zfill_done0): > + ret > + > + /* Coming from vec1/vec2 we must be able to zfill at least 2x > + VEC. */ > + .p2align 4,, 8 > +L(zfill_vec3): > + subq $(VEC_SIZE * -2), %rdi > + addq $(CHAR_PER_VEC * -2), %rdx > + .p2align 4,, 2 > +L(zfill_vec1): > + bsfq %rcx, %rcx > + /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. > + */ > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > + subq %rcx, %rdx > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + > + > + VMOVU %VZERO, (%rdi) > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + cmpq $(CHAR_PER_VEC * 2), %rdx > + jb L(zfill_done0) > +L(zfill_more_2x_vec): > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VZERO, (VEC_SIZE)(%rdi) > + subq $(CHAR_PER_VEC * 4 - 1), %rdx > + jbe L(zfill_done) > + > +# ifdef USE_AS_WCSCPY > + leaq (%rdi, %rdx, CHAR_SIZE), %rdx > +# else > + addq %rdi, %rdx > +# endif > + > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) > + > + > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) > + > + subq $-(VEC_SIZE * 4), %rdi > + cmpq %rdi, %rdx > + jbe L(zfill_done) > + > + /* Align rdi and zfill loop. */ > + andq $-(VEC_SIZE), %rdi > + .p2align 4,, 12 > +L(zfill_loop_4x_vec): > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) > + subq $-(VEC_SIZE * 4), %rdi > + cmpq %rdi, %rdx > + ja L(zfill_loop_4x_vec) > +L(zfill_done): > + ret > + > + > + /* Less 1x VEC case if we are not using evex masked store. */ > +# if !USE_EVEX_MASKED_STORE > + .p2align 4,, 8 > +L(copy_1x): > + /* Special case for copy 1x. It can be handled quickly and many > + buffer sizes have convenient alignment. */ > + VMOVU %VMM(0), (%rdi) > + /* If no zeros then we are done. */ > + testl %ecx, %ecx > + jz L(ret_1x_1x) > + > + /* Need to zfill, not we know that length <= CHAR_PER_VEC so we > + only handle the small case here. */ > + bsf %VRCX, %VRCX > +L(zfill_less_vec_no_bsf): > + /* Adjust length / dst then just zfill less_vec. 
*/ > + subq %rcx, %rdx > +# ifdef USE_AS_WCSCPY > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + > +L(zfill_less_vec): > + cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx > + jb L(zfill_less_half) > + > + VMOVU %VZERO_HALF, (%rdi) > + VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + ret > +# ifdef USE_AS_STPCPY > +L(ret_1x_1x): > + leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax > + ret > +# endif > + > + > +# if VEC_SIZE == 64 > + .p2align 4,, 4 > +L(copy_32_63): > + /* Overfill to avoid branches. */ > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > + VMOVU %VMM_256(0), (%rdi) > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + > + /* We are taking advantage of the fact that to be here we must > + be writing null-term as (%rdi, %rcx) we have a byte of lee- > + way for overwriting. */ > + cmpl %ecx, %edx > + ja L(zfill_less_vec_no_bsf) > +# ifndef USE_AS_STPCPY > +L(ret_1x_1x): > +# else > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > +# endif > + > + .p2align 4,, 4 > +L(copy_16_31): > + /* Overfill to avoid branches. */ > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > + VMOVU %VMM_128(0), (%rdi) > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + cmpl %ecx, %edx > + > + /* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then > + we have a larger copy block for 32-63 so this is just falls > + through to zfill 16-31. If VEC_SIZE == 32 then we check for > + full zfill of less 1x VEC. */ > +# if VEC_SIZE == 64 > + jbe L(ret_16_31) > + subl %ecx, %edx > +# ifdef USE_AS_WCSCPY > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_less_half): > +L(zfill_less_32): > + cmpl $(16 / CHAR_SIZE), %edx > + jb L(zfill_less_16) > + VMOVU %VZERO_128, (%rdi) > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > + ret > +# endif > +L(ret_16_31): > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > +# else > + /* VEC_SIZE == 32 begins. */ > + ja L(zfill_less_vec_no_bsf) > +# ifndef USE_AS_STPCPY > +L(ret_1x_1x): > +# else > +# ifdef USE_AS_WCSCPY > + adcq $0, %rdx > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# endif > + ret > +# endif > + > + > + .p2align 4,, 4 > +L(copy_8_15): > + /* Overfill to avoid branches. */ > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > + vmovq %VMM_128(0), (%rdi) > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + cmpl %ecx, %edx > + jbe L(ret_8_15) > + subl %ecx, %edx > +# ifdef USE_AS_WCSCPY > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > +# else > + addq %rcx, %rdi > +# endif > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + .p2align 4,, 8 > +# if VEC_SIZE == 32 > +L(zfill_less_half): > +# endif > +L(zfill_less_16): > + xorl %ecx, %ecx > + cmpl $(8 / CHAR_SIZE), %edx > + jb L(zfill_less_8) > + movq %rcx, (%rdi) > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > +# ifndef USE_AS_STPCPY > +L(ret_8_15): > +# endif > + ret > + > + .p2align 4,, 8 > +L(less_1x_vec): > + je L(copy_1x) > + > + /* We will need `tzcnt` result for all other copy sizes. 
*/ > + tzcnt %VRCX, %VRCX > +# if VEC_SIZE == 64 > + cmpl $(32 / CHAR_SIZE), %edx > + jae L(copy_32_63) > +# endif > + > + cmpl $(16 / CHAR_SIZE), %edx > + jae L(copy_16_31) > + > + cmpl $(8 / CHAR_SIZE), %edx > + jae L(copy_8_15) > +# ifdef USE_AS_WCSCPY > + testl %ecx, %ecx > + jz L(zfill_less_8_set_ret) > + > + movl (%rsi, %rdx, CHAR_SIZE), %esi > + vmovd %VMM_128(0), (%rdi) > + movl %esi, (%rdi, %rdx, CHAR_SIZE) > +# ifdef USE_AS_STPCPY > + cmpl %ecx, %edx > +L(ret_8_15): > + adcq $0, %rdx > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# endif > + ret > +L(zfill_less_8_set_ret): > + xorl %ecx, %ecx > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_less_8): > + movl %ecx, (%rdi) > + movl %ecx, (%rdi, %rdx, CHAR_SIZE) > + ret > +# else > + cmpl $3, %edx > + jb L(copy_0_3) > + /* Overfill to avoid branches. */ > + movl -3(%rsi, %rdx), %esi > + vmovd %VMM_128(0), (%rdi) > + movl %esi, -3(%rdi, %rdx) > + cmpl %ecx, %edx > + jbe L(ret_4_7) > + subq %rcx, %rdx > + addq %rcx, %rdi > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + xorl %ecx, %ecx > + .p2align 4,, 8 > +L(zfill_less_8): > + cmpl $3, %edx > + jb L(zfill_less_3) > + movl %ecx, (%rdi) > + movl %ecx, -3(%rdi, %rdx) > +# ifdef USE_AS_STPCPY > + ret > +# endif > + > +L(ret_4_7): > +# ifdef USE_AS_STPCPY > +L(ret_8_15): > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > + ret > + > + .p2align 4,, 4 > +L(zfill_less_3): > + testl %edx, %edx > + jz L(zfill_1) > + movw %cx, (%rdi) > +L(zfill_1): > + movb %cl, (%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > +L(copy_0_3): > + vmovd %VMM_128(0), %r8d > + testl %edx, %edx > + jz L(copy_1) > + movw %r8w, (%rdi) > + cmpl %ecx, %edx > + ja L(zfill_from_1) > + movzbl (%rsi, %rdx), %r8d > +# ifdef USE_AS_STPCPY > + movl %edx, %eax > + adcq %rdi, %rax > + movb %r8b, (%rdi, %rdx) > + ret > +# endif > + > +L(copy_1): > +# ifdef USE_AS_STPCPY > + movl %edx, %eax > + cmpl %ecx, %edx > + adcq %rdi, %rax > +# endif > +# ifdef USE_AS_WCSCPY > + vmovd %VMM_128(0), (%rdi) > +# else > + movb %r8b, (%rdi, %rdx) > +# endif > + ret > +# endif > + > + > +# ifndef USE_AS_WCSCPY > + .p2align 4,, 8 > +L(zfill_from_1): > +# ifdef USE_AS_STPCPY > + leaq (%rdi, %rcx), %rax > +# endif > + movw $0, -1(%rdi, %rdx) > + ret > +# endif > + > + .p2align 4,, 4 > +L(zero_len): > + incq %rdx > + jne L(best_effort_strncpy) > + movq %rdi, %rax > + ret > +# endif > + > + > + .p2align 4,, 4 > + .p2align 6,, 8 > +L(page_cross): > + movq %rsi, %rax > + andq $(VEC_SIZE * -1), %rax > + VPCMPEQ (%rax), %VZERO, %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WCSCPY > + movl %esi, %r8d > + shrl $2, %r8d > + andl $(CHAR_PER_VEC - 1), %r8d > + shrx %VR8, %VRCX, %VRCX > +# else > + shrx %VRSI, %VRCX, %VRCX > +# endif > + > + /* Compute amount of bytes we checked. */ > + subl %esi, %eax > + andl $(VEC_SIZE - 1), %eax > +# ifdef USE_AS_WCSCPY > + shrl $2, %eax > +# endif > + > + /* If rax > rdx then we are finishing the copy at the end of the > + page. */ > + cmpq %rax, %rdx > + jb L(page_cross_small) > + > + > + /* If rcx is non-zero then continue. */ > + test %VRCX, %VRCX > + jz L(page_cross_continue) > + > + /* We found zero-CHAR so need to copy then zfill (we know we > + didn't cover all of length here). 
*/ > + bsf %VRCX, %VRCX > +L(movsb_and_zfill): > + incl %ecx > + subq %rcx, %rdx > +# ifdef USE_AS_STPCPY > + leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > +# else > + movq %rdi, %rax > +# endif > + > + REP_MOVS > +# ifdef USE_AS_WCSCPY > + movl $0, (%rdi) > +# else > + movb $0, (%rdi) > +# endif > + jmp L(zfill_from_page_cross) > + > +L(page_cross_small): > + tzcnt %VRCX, %VRCX > + cmpl %ecx, %edx > + jbe L(page_cross_copy_only) > + > + /* Do a zfill of the tail before copying. */ > + movq %rdi, %r9 > + xorl %eax, %eax > + > + movl %ecx, %r8d > + > + subl %ecx, %edx > + leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > + movl %edx, %ecx > + REP_STOS > + movq %r9, %rdi > + movl %r8d, %edx > +L(page_cross_copy_only): > + leal 1(%rdx), %ecx > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + adcl $0, %edx > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# else > + movq %rdi, %rax > +# endif > + REP_MOVS > + ret > + > + > +L(best_effort_strncpy): > + movq %rdx, %rcx > + xorl %eax, %eax > + movq %rdi, %r8 > + /* The length is >= 2^63. We very much so expect to segfault at > + rep stos. If that doesn't happen then just strcpy to finish. > + */ > + REP_STOS > + movq %r8, %rdi > + jmp OVERFLOW_STRCPY > +END(STRNCPY) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > new file mode 100644 > index 0000000000..d5ff4cbe50 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h Please add a copyright notice. > @@ -0,0 +1,65 @@ > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1 > + > +#if defined USE_MULTIARCH && IS_IN(libc) > +# define UNDERSCORES __ > +# ifdef USE_WITH_SSE2 > +# define ISA_EXT _sse2 > +# elif defined USE_WITH_AVX > +# ifdef USE_WITH_RTM > +# define ISA_EXT _avx_rtm > +# else > +# define ISA_EXT _avx > +# endif > +# elif defined USE_WITH_AVX2 Do we have a function with both AVX and AVX2 versions? If not, should keep just 1. > +# ifdef USE_WITH_RTM > +# define ISA_EXT _avx2_rtm > +# else > +# define ISA_EXT _avx2 > +# endif > + > +# elif defined USE_WITH_EVEX256 > +# define ISA_EXT _evex > +# elif defined USE_WITH_EVEX512 > +# define ISA_EXT _evex512 > +# endif > +#else > +# define UNDERSCORES > +# define ISA_EXT > +#endif > + > +#ifdef USE_AS_WCSCPY > +# define STRCPY_PREFIX wc > +# define STRCAT_PREFIX wcs > +# ifdef USE_AS_STPCPY > +# define STRCPY_POSTFIX pcpy > +# else > +# define STRCPY_POSTFIX scpy > +# endif > +#else > +# define STRCPY_PREFIX st > +# define STRCAT_PREFIX str > +# ifdef USE_AS_STPCPY > +# define STRCPY_POSTFIX pcpy > +# else > +# define STRCPY_POSTFIX rcpy > +# endif > +#endif > +#define STRCAT_POSTFIX cat > + > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \ > + underscores##prefix##postfix##ext > + > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__) > + > +#ifndef OVERFLOW_STRCPY > +# define OVERFLOW_STRCPY \ > + OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT) > +#endif > + > +#ifndef OVERFLOW_STRCAT > +# define OVERFLOW_STRCAT \ > + OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT) > +#endif > + > +#endif > -- > 2.34.1 > H.J.
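[Editorial aside on the overflow-fallback header reviewed above: a minimal, stand-alone sketch of the token pasting it relies on. It is not part of the patch; the macro values below are the ones strncpy-or-cat-overflow-def.h would select for the stpncpy case (USE_AS_STPCPY without USE_AS_WCSCPY), and it assumes the evex256 vec header defines USE_WITH_EVEX256 so that ISA_EXT resolves to _evex.]

    /* Hypothetical, stand-alone sketch (not part of the patch) of the
       token-pasting scheme strncpy-or-cat-overflow-def.h uses to pick
       the overflow fallback.  The values below mirror what the header
       selects for stpncpy (USE_AS_STPCPY, no USE_AS_WCSCPY), assuming
       the evex256 build defines USE_WITH_EVEX256.  */
    #include <stdio.h>

    #define UNDERSCORES __
    #define STRCPY_PREFIX st
    #define STRCPY_POSTFIX pcpy
    #define ISA_EXT _evex

    /* Same two-level expansion as the header, so the arguments are
       macro-expanded before being pasted together.  */
    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

    #define OVERFLOW_STRCPY \
      OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)

    /* Stringize after expansion so the resulting symbol can be printed.  */
    #define STR_(x) #x
    #define STR(x) STR_(x)

    int
    main (void)
    {
      /* Prints "__stpcpy_evex": the symbol the L(best_effort_strncpy)
         path tail-calls once the rep stos probe has not faulted.  */
      puts (STR (OVERFLOW_STRCPY));
      return 0;
    }

[Built with any C compiler, this prints __stpcpy_evex, i.e. the ISA-matched fallback reached via the `jmp OVERFLOW_STRCPY` in the quoted L(best_effort_strncpy) code.]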
On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote: > > Optimizations are: > > 1. Use more overlapping stores to avoid branches. > > 2. Reduce how unrolled the aligning copies are (this is more of a > > code-size save, its a negative for some sizes in terms of > > perf). > > 3. Improve the loop a bit (similiar to what we do in strlen with > > 2x vpminu + kortest instead of 3x vpminu + kmov + test). > > 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the > > number that are taken. > > > > Performance Changes: > > > > Times are from N = 10 runs of the benchmark suite and are > > reported as geometric mean of all ratios of > > New Implementation / Old Implementation. > > > > stpcpy-evex -> 0.922 > > strcat-evex -> 0.985 > > strcpy-evex -> 0.880 > > > > strncpy-evex -> 0.831 > > stpncpy-evex -> 0.780 > > > > strncat-evex -> 0.958 > > > > Code Size Changes: > > function -> Bytes New / Bytes Old -> Ratio > > > > strcat-evex -> 819 / 1874 -> 0.437 > > strcpy-evex -> 700 / 1074 -> 0.652 > > stpcpy-evex -> 735 / 1094 -> 0.672 > > > > strncpy-evex -> 1397 / 2611 -> 0.535 > > stpncpy-evex -> 1489 / 2691 -> 0.553 > > > > strncat-evex -> 1184 / 2832 -> 0.418 > > > > Notes: > > 1. Because of the significant difference between the > > implementations they are split into three files. > > > > strcpy-evex.S -> strcpy, stpcpy, strcat > > strncpy-evex.S -> strncpy > > strncat-evex.S > strncat > > > > I couldn't find a way to merge them without making the > > ifdefs incredibly difficult to follow. > > > > 2. All implementations can be made evex512 by including > > "x86-evex512-vecs.h" at the top. > > > > 3. All implementations have an optional define: > > `USE_EVEX_MASKED_STORE` > > Setting to one uses evex-masked stores for handling short > > strings. This saves code size and branches. It's disabled > > for all implementations are the moment as there are some > > serious drawbacks to masked stores in certain cases, but > > that may be fixed on future architectures. > > > > Full check passes on x86-64 and build succeeds for all ISA levels w/ > > and w/o multiarch. > > --- > > sysdeps/x86_64/multiarch/stpncpy-evex.S | 5 +- > > sysdeps/x86_64/multiarch/strcat-evex.S | 291 +--- > > sysdeps/x86_64/multiarch/strcat-strlen-evex.S | 110 ++ > > sysdeps/x86_64/multiarch/strcpy-evex.S | 1282 ++++++----------- > > sysdeps/x86_64/multiarch/strncat-evex.S | 525 ++++++- > > sysdeps/x86_64/multiarch/strncpy-evex.S | 995 ++++++++++++- > > .../multiarch/strncpy-or-cat-overflow-def.h | 65 + > > 7 files changed, 2100 insertions(+), 1173 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > index 99ea76a372..3693491baa 100644 > > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S > > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > @@ -3,6 +3,5 @@ > > #endif > > > > #define USE_AS_STPCPY > > -#define USE_AS_STRNCPY > > -#define STRCPY STPNCPY > > -#include "strcpy-evex.S" > > +#define STRNCPY STPNCPY > > +#include "strncpy-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S > > index 0e2df947e9..b4207b7889 100644 > > --- a/sysdeps/x86_64/multiarch/strcat-evex.S > > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S > > @@ -1,286 +1,7 @@ > > -/* strcat with 256-bit EVEX instructions. 
> > - Copyright (C) 2021-2022 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. > > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <https://www.gnu.org/licenses/>. */ > > - > > -#include <isa-level.h> > > - > > -#if ISA_SHOULD_BUILD (4) > > - > > - > > -# include <sysdep.h> > > - > > -# ifndef STRCAT > > -# define STRCAT __strcat_evex > > -# endif > > - > > -# define VMOVU vmovdqu64 > > -# define VMOVA vmovdqa64 > > - > > -/* zero register */ > > -# define XMMZERO xmm16 > > -# define YMMZERO ymm16 > > -# define YMM0 ymm17 > > -# define YMM1 ymm18 > > - > > -# define USE_AS_STRCAT > > - > > -/* Number of bytes in a vector register */ > > -# define VEC_SIZE 32 > > - > > - .section .text.evex,"ax",@progbits > > -ENTRY (STRCAT) > > - mov %rdi, %r9 > > -# ifdef USE_AS_STRNCAT > > - mov %rdx, %r8 > > -# endif > > - > > - xor %eax, %eax > > - mov %edi, %ecx > > - and $((VEC_SIZE * 4) - 1), %ecx > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > - cmp $(VEC_SIZE * 3), %ecx > > - ja L(fourth_vector_boundary) > > - vpcmpb $0, (%rdi), %YMMZERO, %k0 > > - kmovd %k0, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_first_vector) > > - mov %rdi, %rax > > - and $-VEC_SIZE, %rax > > - jmp L(align_vec_size_start) > > -L(fourth_vector_boundary): > > - mov %rdi, %rax > > - and $-VEC_SIZE, %rax > > - vpcmpb $0, (%rax), %YMMZERO, %k0 > > - mov $-1, %r10d > > - sub %rax, %rcx > > - shl %cl, %r10d > > - kmovd %k0, %edx > > - and %r10d, %edx > > - jnz L(exit) > > - > > -L(align_vec_size_start): > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > - kmovd %k0, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_second_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_third_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > - kmovd %k2, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fourth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > - kmovd %k3, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fifth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > - add $(VEC_SIZE * 4), %rax > > - kmovd %k4, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_second_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_third_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > - kmovd %k2, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fourth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > - kmovd %k3, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fifth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > - kmovd %k4, %edx > > - add $(VEC_SIZE * 4), %rax > > - test %edx, %edx > > - jnz L(exit_null_on_second_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), 
%YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_third_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > - kmovd %k2, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fourth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > - kmovd %k3, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fifth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > - add $(VEC_SIZE * 4), %rax > > - kmovd %k4, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_second_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_third_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > - kmovd %k2, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fourth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > - kmovd %k3, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fifth_vector) > > - > > - test $((VEC_SIZE * 4) - 1), %rax > > - jz L(align_four_vec_loop) > > - > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > - add $(VEC_SIZE * 5), %rax > > - kmovd %k4, %edx > > - test %edx, %edx > > - jnz L(exit) > > - > > - test $((VEC_SIZE * 4) - 1), %rax > > - jz L(align_four_vec_loop) > > - > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > - add $VEC_SIZE, %rax > > - kmovd %k0, %edx > > - test %edx, %edx > > - jnz L(exit) > > - > > - test $((VEC_SIZE * 4) - 1), %rax > > - jz L(align_four_vec_loop) > > - > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > - add $VEC_SIZE, %rax > > - kmovd %k0, %edx > > - test %edx, %edx > > - jnz L(exit) > > - > > - test $((VEC_SIZE * 4) - 1), %rax > > - jz L(align_four_vec_loop) > > - > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 > > - add $VEC_SIZE, %rax > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit) > > - > > - add $VEC_SIZE, %rax > > - > > - .p2align 4 > > -L(align_four_vec_loop): > > - VMOVA (%rax), %YMM0 > > - VMOVA (VEC_SIZE * 2)(%rax), %YMM1 > > - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 > > - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 > > - vpminub %YMM0, %YMM1, %YMM0 > > - /* If K0 != 0, there is a null byte. 
*/ > > - vpcmpb $0, %YMM0, %YMMZERO, %k0 > > - add $(VEC_SIZE * 4), %rax > > - ktestd %k0, %k0 > > - jz L(align_four_vec_loop) > > - > > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 > > - sub $(VEC_SIZE * 5), %rax > > - kmovd %k0, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_second_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_third_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > - kmovd %k2, %edx > > - test %edx, %edx > > - jnz L(exit_null_on_fourth_vector) > > - > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > - kmovd %k3, %edx > > - sub %rdi, %rax > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - add $(VEC_SIZE * 4), %rax > > - jmp L(StartStrcpyPart) > > - > > - .p2align 4 > > -L(exit): > > - sub %rdi, %rax > > -L(exit_null_on_first_vector): > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - jmp L(StartStrcpyPart) > > - > > - .p2align 4 > > -L(exit_null_on_second_vector): > > - sub %rdi, %rax > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - add $VEC_SIZE, %rax > > - jmp L(StartStrcpyPart) > > - > > - .p2align 4 > > -L(exit_null_on_third_vector): > > - sub %rdi, %rax > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - add $(VEC_SIZE * 2), %rax > > - jmp L(StartStrcpyPart) > > - > > - .p2align 4 > > -L(exit_null_on_fourth_vector): > > - sub %rdi, %rax > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - add $(VEC_SIZE * 3), %rax > > - jmp L(StartStrcpyPart) > > - > > - .p2align 4 > > -L(exit_null_on_fifth_vector): > > - sub %rdi, %rax > > - bsf %rdx, %rdx > > - add %rdx, %rax > > - add $(VEC_SIZE * 4), %rax > > - > > - .p2align 4 > > -L(StartStrcpyPart): > > - lea (%r9, %rax), %rdi > > - mov %rsi, %rcx > > - mov %r9, %rax /* save result */ > > - > > -# ifdef USE_AS_STRNCAT > > - test %r8, %r8 > > - jz L(ExitZero) > > -# define USE_AS_STRNCPY > > -# endif > > - > > -# include "strcpy-evex.S" > > +#ifndef STRCAT > > +# define STRCAT __strcat_evex > > #endif > > + > > +#define USE_AS_STRCAT > > +#define STRCPY STRCAT > > +#include "strcpy-evex.S" > > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > new file mode 100644 > > index 0000000000..9530d7b683 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > @@ -0,0 +1,110 @@ > > +/* strlen used for begining of str{n}cat using EVEX 256/512. > > + Copyright (C) 2011-2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > + > > +/* NOTE: This file is meant to be included by strcat-evex or > > + strncat-evex and does not standalone. Before including %rdi > > + must be saved in %rax. */ > > Since this file isn't standalone, please rename it to .h. Can it be .h.S so it plays well it IDE modes? 
> > > + > > + > > +/* Simple strlen implementation that ends at > > + L(strcat_strlen_done). */ > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > + movq %rdi, %r8 > > + andq $(VEC_SIZE * -1), %r8 > > + VPCMPEQ (%r8), %VZERO, %k0 > > + KMOV %k0, %VRCX > > +#ifdef USE_AS_WCSCPY > > + subl %r8d, %edi > > + shrl $2, %edi > > +#endif > > + shrx %VRDI, %VRCX, %VRCX > > +#ifdef USE_AS_WCSCPY > > + movq %rax, %rdi > > +#endif > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v0) > > + > > + > > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 > > + KMOV %k0, %VRCX > > + leaq (VEC_SIZE)(%r8), %rdi > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v0) > > + > > + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v1) > > + > > + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v2) > > + > > + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v3) > > + > > + andq $-(VEC_SIZE * 4), %rdi > > + .p2align 4,, 8 > > +L(loop_2x_vec): > > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) > > + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) > > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) > > + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) > > + VPTESTN %VMM(1), %VMM(1), %k1 > > + VPTESTN %VMM(3), %VMM(3), %k3 > > + subq $(VEC_SIZE * -4), %rdi > > + KORTEST %k1, %k3 > > + jz L(loop_2x_vec) > > + > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v0) > > + > > + KMOV %k1, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v1) > > + > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(bsf_and_done_v2) > > + > > + KMOV %k3, %VRCX > > +L(bsf_and_done_v3): > > + addq $VEC_SIZE, %rdi > > +L(bsf_and_done_v2): > > + bsf %VRCX, %VRCX > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi > > + jmp L(strcat_strlen_done) > > + > > + .p2align 4,, 4 > > +L(bsf_and_done_v1): > > + addq $VEC_SIZE, %rdi > > +L(bsf_and_done_v0): > > + bsf %VRCX, %VRCX > > +#ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +#else > > + addq %rcx, %rdi > > +#endif > > +L(strcat_strlen_done): > > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S > > index 82e45ac675..1ba0195ed2 100644 > > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S > > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S > > @@ -1,4 +1,4 @@ > > -/* strcpy with 256-bit EVEX instructions. > > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. > > Copyright (C) 2021-2022 Free Software Foundation, Inc. > > This file is part of the GNU C Library. > > > > @@ -17,990 +17,526 @@ > > <https://www.gnu.org/licenses/>. */ > > > > #include <isa-level.h> > > - > > #if ISA_SHOULD_BUILD (4) > > > > > > -# ifndef USE_AS_STRCAT > > -# include <sysdep.h> > > + /* Use evex-masked stores for small sizes. Turned off at the > > + moment. */ > > +# define USE_EVEX_MASKED_STORE 0 > > + /* Use movsb in page cross case to save code size. 
*/ > > +# define USE_MOVSB_IN_PAGE_CROSS 1 > > > > -# ifndef STRCPY > > -# define STRCPY __strcpy_evex > > -# endif > > +# include <sysdep.h> > > > > +# ifndef VEC_SIZE > > +# include "x86-evex256-vecs.h" > > # endif > > > > -# define VMOVU vmovdqu64 > > -# define VMOVA vmovdqa64 > > - > > -/* Number of bytes in a vector register */ > > -# ifndef VEC_SIZE > > -# define VEC_SIZE 32 > > +# ifndef STRCPY > > +# define STRCPY __strcpy_evex > > # endif > > > > -# define XMM2 xmm18 > > -# define XMM3 xmm19 > > > > -# define YMM2 ymm18 > > -# define YMM3 ymm19 > > -# define YMM4 ymm20 > > -# define YMM5 ymm21 > > -# define YMM6 ymm22 > > -# define YMM7 ymm23 > > +# ifdef USE_AS_WCSCPY > > +# define VMOVU_MASK vmovdqu32 > > +# define VPMIN vpminud > > +# define VPTESTN vptestnmd > > +# define VPTEST vptestmd > > +# define VPCMPEQ vpcmpeqd > > +# define CHAR_SIZE 4 > > > > -# ifndef USE_AS_STRCAT > > +# define REP_MOVS rep movsd > > > > -/* zero register */ > > -# define XMMZERO xmm16 > > -# define YMMZERO ymm16 > > -# define YMM1 ymm17 > > - > > - .section .text.evex,"ax",@progbits > > -ENTRY (STRCPY) > > -# ifdef USE_AS_STRNCPY > > - mov %RDX_LP, %R8_LP > > - test %R8_LP, %R8_LP > > - jz L(ExitZero) > > -# endif > > - mov %rsi, %rcx > > -# ifndef USE_AS_STPCPY > > - mov %rdi, %rax /* save result */ > > -# endif > > +# define USE_WIDE_CHAR > > +# else > > +# define VMOVU_MASK vmovdqu8 > > +# define VPMIN vpminub > > +# define VPTESTN vptestnmb > > +# define VPTEST vptestmb > > +# define VPCMPEQ vpcmpeqb > > +# define CHAR_SIZE 1 > > > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > +# define REP_MOVS rep movsb > > # endif > > > > - and $((VEC_SIZE * 4) - 1), %ecx > > - cmp $(VEC_SIZE * 2), %ecx > > - jbe L(SourceStringAlignmentLessTwoVecSize) > > - > > - and $-VEC_SIZE, %rsi > > - and $(VEC_SIZE - 1), %ecx > > - > > - vpcmpb $0, (%rsi), %YMMZERO, %k0 > > - kmovd %k0, %edx > > - shr %cl, %rdx > > +# include "reg-macros.h" > > > > -# ifdef USE_AS_STRNCPY > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > - mov $VEC_SIZE, %r10 > > - sub %rcx, %r10 > > - cmp %r10, %r8 > > -# else > > - mov $(VEC_SIZE + 1), %r10 > > - sub %rcx, %r10 > > - cmp %r10, %r8 > > -# endif > > - jbe L(CopyVecSizeTailCase2OrCase3) > > -# endif > > - test %edx, %edx > > - jnz L(CopyVecSizeTail) > > - > > - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 > > - kmovd %k1, %edx > > > > -# ifdef USE_AS_STRNCPY > > - add $VEC_SIZE, %r10 > > - cmp %r10, %r8 > > - jbe L(CopyTwoVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > - jnz L(CopyTwoVecSize) > > - > > - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ > > - VMOVU %YMM2, (%rdi) > > - > > -/* If source address alignment != destination address alignment */ > > - .p2align 4 > > -L(UnalignVecSizeBoth): > > - sub %rcx, %rdi > > -# ifdef USE_AS_STRNCPY > > - add %rcx, %r8 > > - sbb %rcx, %rcx > > - or %rcx, %r8 > > -# endif > > - mov $VEC_SIZE, %rcx > > - VMOVA (%rsi, %rcx), %YMM2 > > - VMOVU %YMM2, (%rdi, %rcx) > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $(VEC_SIZE * 3), %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec2) > > +# ifdef USE_AS_STPCPY > > +# define END_REG rax > > # else > > - jnz L(CopyVecSize) > > +# define END_REG rdi, %rdx, CHAR_SIZE > > # endif > > > > - VMOVU %YMM2, (%rdi, %rcx) > > - VMOVA VEC_SIZE(%rsi, %rcx), 
%YMM3 > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec3) > > +# ifdef USE_AS_STRCAT > > +# define PAGE_ALIGN_REG edx > > +# define PAGE_ALIGN_REG_64 rdx > > # else > > - jnz L(CopyVecSize) > > +# define PAGE_ALIGN_REG eax > > +# define PAGE_ALIGN_REG_64 rax > > # endif > > > > - VMOVU %YMM3, (%rdi, %rcx) > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 > > - vpcmpb $0, %YMM4, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec4) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > +# define VZERO VMM(7) > > +# define VZERO_128 VMM_128(7) > > > > - VMOVU %YMM4, (%rdi, %rcx) > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec2) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > > > - VMOVU %YMM2, (%rdi, %rcx) > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec2) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > > - VMOVU %YMM2, (%rdi, %rcx) > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > - kmovd %k0, %edx > > - add $VEC_SIZE, %rcx > > -# ifdef USE_AS_STRNCPY > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > -# endif > > - test %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec3) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > > > - VMOVU %YMM3, (%rdi, %rcx) > > - mov %rsi, %rdx > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > - and $-(VEC_SIZE * 4), %rsi > > - sub %rsi, %rdx > > - sub %rdx, %rdi > > -# ifdef USE_AS_STRNCPY > > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 > > -# endif > > -L(UnalignedFourVecSizeLoop): > > - VMOVA (%rsi), %YMM4 > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > - vpminub %YMM5, %YMM4, %YMM2 > > - vpminub %YMM7, %YMM6, %YMM3 > > - vpminub %YMM2, %YMM3, %YMM2 > > - /* If K7 != 0, there is a null byte. 
*/ > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > - kmovd %k7, %edx > > -# ifdef USE_AS_STRNCPY > > - sub $(VEC_SIZE * 4), %r8 > > - jbe L(UnalignedLeaveCase2OrCase3) > > + .section SECTION(.text), "ax", @progbits > > +ENTRY(STRCPY) > > +# ifdef USE_AS_STRCAT > > + movq %rdi, %rax > > +# include "strcat-strlen-evex.S" > > # endif > > - test %edx, %edx > > - jnz L(UnalignedFourVecSizeLeave) > > - > > -L(UnalignedFourVecSizeLoop_start): > > - add $(VEC_SIZE * 4), %rdi > > - add $(VEC_SIZE * 4), %rsi > > - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) > > - VMOVA (%rsi), %YMM4 > > - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > - vpminub %YMM5, %YMM4, %YMM2 > > - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > - VMOVU %YMM7, -VEC_SIZE(%rdi) > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > - vpminub %YMM7, %YMM6, %YMM3 > > - vpminub %YMM2, %YMM3, %YMM2 > > - /* If K7 != 0, there is a null byte. */ > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > - kmovd %k7, %edx > > -# ifdef USE_AS_STRNCPY > > - sub $(VEC_SIZE * 4), %r8 > > - jbe L(UnalignedLeaveCase2OrCase3) > > + > > + movl %esi, %PAGE_ALIGN_REG > > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG > > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG > > + ja L(page_cross) > > +L(page_cross_continue): > > + VMOVU (%rsi), %VMM(0) > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > + movq %rdi, %rax > > # endif > > - test %edx, %edx > > - jz L(UnalignedFourVecSizeLoop_start) > > > > -L(UnalignedFourVecSizeLeave): > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > - kmovd %k1, %edx > > - test %edx, %edx > > - jnz L(CopyVecSizeUnaligned_0) > > > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > > - kmovd %k2, %ecx > > - test %ecx, %ecx > > - jnz L(CopyVecSizeUnaligned_16) > > + /* Two short string implementations. One with traditional > > + branching approach and one with masked instructions (which > > + have potential for dramatically bad perf if dst splits a > > + page and is not in the TLB). 
*/ > > +# if USE_EVEX_MASKED_STORE > > + VPTEST %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > +# ifdef USE_AS_WCSCPY > > + subl $((1 << CHAR_PER_VEC)- 1), %VRCX > > +# else > > + inc %VRCX > > +# endif > > + jz L(more_1x_vec) > > + KMOV %VRCX, %k1 > > + KXOR %k0, %k1, %k1 > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > - kmovd %k3, %edx > > - test %edx, %edx > > - jnz L(CopyVecSizeUnaligned_32) > > - > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > - kmovd %k4, %ecx > > - bsf %ecx, %edx > > - VMOVU %YMM4, (%rdi) > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > -# ifdef USE_AS_STPCPY > > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax > > -# endif > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > - add $(VEC_SIZE - 1), %r8 > > - sub %rdx, %r8 > > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi > > - jmp L(StrncpyFillTailWithZero) > > -# else > > - add $(VEC_SIZE * 3), %rsi > > - add $(VEC_SIZE * 3), %rdi > > - jmp L(CopyVecSizeExit) > > -# endif > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > > -/* If source address alignment == destination address alignment */ > > +# ifdef USE_AS_STPCPY > > + bsf %VRCX, %VRCX > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > +# endif > > + ret > > > > -L(SourceStringAlignmentLessTwoVecSize): > > - VMOVU (%rsi), %YMM3 > > - VMOVU VEC_SIZE(%rsi), %YMM2 > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > - kmovd %k0, %edx > > +# else > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jz L(more_1x_vec) > > > > -# ifdef USE_AS_STRNCPY > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > - cmp $VEC_SIZE, %r8 > > + xorl %edx, %edx > > + bsf %VRCX, %VRDX > > +# ifdef USE_AS_STPCPY > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# endif > > + > > + /* Use mask bits in rcx to detect which copy we need. If the low > > + mask is zero then there must be a bit set in the upper half. > > + I.e if rcx != 0 and ecx == 0, then match must be upper 32 > > + bits so we use L(copy_32_63). */ > > +# if VEC_SIZE == 64 > > +# ifdef USE_AS_WCSCPY > > + testb %cl, %cl > > +# else > > + testl %ecx, %ecx > > +# endif > > + jz L(copy_32_63) > > +# endif > > + > > +# ifdef USE_AS_WCSCPY > > + testb $0xf, %cl > > # else > > - cmp $(VEC_SIZE + 1), %r8 > > + testw %cx, %cx > > # endif > > - jbe L(CopyVecSizeTail1Case2OrCase3) > > -# endif > > - test %edx, %edx > > - jnz L(CopyVecSizeTail1) > > + jz L(copy_16_31) > > > > - VMOVU %YMM3, (%rdi) > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > - kmovd %k0, %edx > > > > -# ifdef USE_AS_STRNCPY > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > - cmp $(VEC_SIZE * 2), %r8 > > +# ifdef USE_AS_WCSCPY > > + testb $0x3, %cl > > # else > > - cmp $((VEC_SIZE * 2) + 1), %r8 > > + testb %cl, %cl > > # endif > > - jbe L(CopyTwoVecSize1Case2OrCase3) > > -# endif > > - test %edx, %edx > > - jnz L(CopyTwoVecSize1) > > - > > - and $-VEC_SIZE, %rsi > > - and $(VEC_SIZE - 1), %ecx > > - jmp L(UnalignVecSizeBoth) > > + jz L(copy_8_15) > > > > -/*------End of main part with loops---------------------*/ > > > > -/* Case1 */ > > +# ifdef USE_AS_WCSCPY > > + vmovd %VMM_128(0), (%rdi) > > + /* No need to copy, we know its zero. 
*/ > > + movl $0, (%END_REG) > > > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) > > - .p2align 4 > > -L(CopyVecSize): > > - add %rcx, %rdi > > -# endif > > -L(CopyVecSizeTail): > > - add %rcx, %rsi > > -L(CopyVecSizeTail1): > > - bsf %edx, %edx > > -L(CopyVecSizeExit): > > - cmp $32, %edx > > - jae L(Exit32_63) > > - cmp $16, %edx > > - jae L(Exit16_31) > > - cmp $8, %edx > > - jae L(Exit8_15) > > - cmp $4, %edx > > - jae L(Exit4_7) > > - cmp $3, %edx > > - je L(Exit3) > > - cmp $1, %edx > > - ja L(Exit2) > > - je L(Exit1) > > - movb $0, (%rdi) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub $1, %r8 > > - lea 1(%rdi), %rdi > > - jnz L(StrncpyFillTailWithZero) > > -# endif > > ret > > +# else > > > > - .p2align 4 > > -L(CopyTwoVecSize1): > > - add $VEC_SIZE, %rsi > > - add $VEC_SIZE, %rdi > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub $VEC_SIZE, %r8 > > -# endif > > - jmp L(CopyVecSizeTail1) > > - > > - .p2align 4 > > -L(CopyTwoVecSize): > > - bsf %edx, %edx > > - add %rcx, %rsi > > - add $VEC_SIZE, %edx > > - sub %ecx, %edx > > - jmp L(CopyVecSizeExit) > > - > > - .p2align 4 > > -L(CopyVecSizeUnaligned_0): > > - bsf %edx, %edx > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > -# endif > > - VMOVU %YMM4, (%rdi) > > - add $((VEC_SIZE * 4) - 1), %r8 > > - sub %rdx, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - jmp L(StrncpyFillTailWithZero) > > -# else > > - jmp L(CopyVecSizeExit) > > -# endif > > + testb $0x7, %cl > > + jz L(copy_4_7) > > > > - .p2align 4 > > -L(CopyVecSizeUnaligned_16): > > - bsf %ecx, %edx > > - VMOVU %YMM4, (%rdi) > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > -# ifdef USE_AS_STPCPY > > - lea VEC_SIZE(%rdi, %rdx), %rax > > -# endif > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > - add $((VEC_SIZE * 3) - 1), %r8 > > - sub %rdx, %r8 > > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi > > - jmp L(StrncpyFillTailWithZero) > > -# else > > - add $VEC_SIZE, %rsi > > - add $VEC_SIZE, %rdi > > - jmp L(CopyVecSizeExit) > > -# endif > > > > - .p2align 4 > > -L(CopyVecSizeUnaligned_32): > > - bsf %edx, %edx > > - VMOVU %YMM4, (%rdi) > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > -# ifdef USE_AS_STPCPY > > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax > > -# endif > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > - add $((VEC_SIZE * 2) - 1), %r8 > > - sub %rdx, %r8 > > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi > > - jmp L(StrncpyFillTailWithZero) > > -# else > > - add $(VEC_SIZE * 2), %rsi > > - add $(VEC_SIZE * 2), %rdi > > - jmp L(CopyVecSizeExit) > > -# endif > > + test %edx, %edx > > + jz L(set_null_term) > > > > -# ifdef USE_AS_STRNCPY > > -# ifndef USE_AS_STRCAT > > - .p2align 4 > > -L(CopyVecSizeUnalignedVec6): > > - VMOVU %YMM6, (%rdi, %rcx) > > - jmp L(CopyVecSizeVecExit) > > - > > - .p2align 4 > > -L(CopyVecSizeUnalignedVec5): > > - VMOVU %YMM5, (%rdi, %rcx) > > - jmp L(CopyVecSizeVecExit) > > - > > - .p2align 4 > > -L(CopyVecSizeUnalignedVec4): > > - VMOVU %YMM4, (%rdi, %rcx) > > - jmp L(CopyVecSizeVecExit) > > - > > - .p2align 4 > > -L(CopyVecSizeUnalignedVec3): > > - VMOVU %YMM3, (%rdi, %rcx) > > - jmp L(CopyVecSizeVecExit) > > + /* NB: make this `vmovw` if support for AVX512-FP16 is added. > > + */ > > + vmovd %VMM_128(0), %esi > > + movw %si, (%rdi) > > + > > + .p2align 4,, 1 > > +L(set_null_term): > > + /* No need to copy, we know its zero. 
*/ > > + movb $0, (%END_REG) > > + ret > > # endif > > > > -/* Case2 */ > > - > > - .p2align 4 > > -L(CopyVecSizeCase2): > > - add $VEC_SIZE, %r8 > > - add %rcx, %rdi > > - add %rcx, %rsi > > - bsf %edx, %edx > > - cmp %r8d, %edx > > - jb L(CopyVecSizeExit) > > - jmp L(StrncpyExit) > > - > > - .p2align 4 > > -L(CopyTwoVecSizeCase2): > > - add %rcx, %rsi > > - bsf %edx, %edx > > - add $VEC_SIZE, %edx > > - sub %ecx, %edx > > - cmp %r8d, %edx > > - jb L(CopyVecSizeExit) > > - jmp L(StrncpyExit) > > - > > -L(CopyVecSizeTailCase2): > > - add %rcx, %rsi > > - bsf %edx, %edx > > - cmp %r8d, %edx > > - jb L(CopyVecSizeExit) > > - jmp L(StrncpyExit) > > - > > -L(CopyVecSizeTail1Case2): > > - bsf %edx, %edx > > - cmp %r8d, %edx > > - jb L(CopyVecSizeExit) > > - jmp L(StrncpyExit) > > - > > -/* Case2 or Case3, Case3 */ > > - > > - .p2align 4 > > -L(CopyVecSizeCase2OrCase3): > > - test %rdx, %rdx > > - jnz L(CopyVecSizeCase2) > > -L(CopyVecSizeCase3): > > - add $VEC_SIZE, %r8 > > - add %rcx, %rdi > > - add %rcx, %rsi > > - jmp L(StrncpyExit) > > - > > - .p2align 4 > > -L(CopyTwoVecSizeCase2OrCase3): > > - test %rdx, %rdx > > - jnz L(CopyTwoVecSizeCase2) > > - add %rcx, %rsi > > - jmp L(StrncpyExit) > > - > > - .p2align 4 > > -L(CopyVecSizeTailCase2OrCase3): > > - test %rdx, %rdx > > - jnz L(CopyVecSizeTailCase2) > > - add %rcx, %rsi > > - jmp L(StrncpyExit) > > - > > - .p2align 4 > > -L(CopyTwoVecSize1Case2OrCase3): > > - add $VEC_SIZE, %rdi > > - add $VEC_SIZE, %rsi > > - sub $VEC_SIZE, %r8 > > -L(CopyVecSizeTail1Case2OrCase3): > > - test %rdx, %rdx > > - jnz L(CopyVecSizeTail1Case2) > > - jmp L(StrncpyExit) > > +# if VEC_SIZE == 64 > > + .p2align 4,, 6 > > +L(copy_32_63): > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > + VMOVU %VMM_256(0), (%rdi) > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > + ret > > +# endif > > + > > + > > + .p2align 4,, 6 > > +L(copy_16_31): > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > + and will save code size. 
*/ > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > + VMOVU %VMM_128(0), (%rdi) > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > + ret > > + > > + .p2align 4,, 8 > > +L(copy_8_15): > > +# ifdef USE_AS_WCSCPY > > + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > +# else > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx > > +# endif > > + vmovq %VMM_128(0), (%rdi) > > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) > > + ret > > # endif > > > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ > > > > - .p2align 4 > > -L(Exit1): > > - movzwl (%rsi), %edx > > - mov %dx, (%rdi) > > -# ifdef USE_AS_STPCPY > > - lea 1(%rdi), %rax > > +# ifndef USE_AS_WCSCPY > > + .p2align 4,, 12 > > +L(copy_4_7): > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > + vmovd %VMM_128(0), (%rdi) > > + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) > > + ret > > # endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub $2, %r8 > > - lea 2(%rdi), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + > > + > > + .p2align 4,, 8 > > +L(more_1x_vec): > > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > + VMOVU %VMM(0), (%rdi) > > # endif > > - ret > > + subq %rsi, %rdi > > + andq $-(VEC_SIZE), %rsi > > + addq %rsi, %rdi > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > > - .p2align 4 > > -L(Exit2): > > - movzwl (%rsi), %ecx > > - mov %cx, (%rdi) > > - movb $0, 2(%rdi) > > + /* Ideally we store after moves to minimize impact of potential > > + false-dependencies. */ > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > + VMOVU %VMM(0), (%rax) > > +# endif > > + > > + VPTESTN %VMM(1), %VMM(1), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x1) > > + > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > + VMOVU %VMM(1), VEC_SIZE(%rdi) > > + > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x2) > > + > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > + > > + VPTESTN %VMM(3), %VMM(3), %k0 > > + KMOV %k0, %VRDX > > + test %VRDX, %VRDX > > + jnz L(ret_vec_x3) > > + > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + VPTESTN %VMM(4), %VMM(4), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x4) > > + > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > + > > + > > + /* Align for 4x loop. */ > > + subq %rsi, %rdi > > + > > + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE > > + we covered before aligning. */ > > + subq $-(VEC_SIZE * 5), %rsi > > + andq $-(VEC_SIZE * 4), %rsi > > + > > + > > + /* Load first half of the loop before entry. 
*/ > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + KORTEST %k2, %k4 > > + jnz L(loop_4x_done) > > + > > + .p2align 4,, 11 > > +L(loop_4x_vec): > > + > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > + > > + subq $(VEC_SIZE * -4), %rsi > > + > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > + > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + KORTEST %k2, %k4 > > + jz L(loop_4x_vec) > > + > > +L(loop_4x_done): > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > + /* Restore rdi (%rdi). */ > > + addq %rsi, %rdi > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x0_end) > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > > + > > + KMOV %k2, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x1) > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > > + > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x2) > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > > + /* Place L(ret_vec_x4) here to save code size. We get a > > + meaningfuly benefit doing this for stpcpy. */ > > + KMOV %k4, %VRDX > > +L(ret_vec_x3): > > + bsf %VRDX, %VRDX > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > # ifdef USE_AS_STPCPY > > - lea 2(%rdi), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub $3, %r8 > > - lea 3(%rdi), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax > > # endif > > +L(return_end): > > ret > > > > - .p2align 4 > > -L(Exit3): > > - mov (%rsi), %edx > > - mov %edx, (%rdi) > > + .p2align 4,, 6 > > +L(ret_vec_x0_end): > > + bsf %VRCX, %VRCX > > # ifdef USE_AS_STPCPY > > - lea 3(%rdi), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub $4, %r8 > > - lea 4(%rdi), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > # endif > > + inc %VRCX > > + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > ret > > > > - .p2align 4 > > -L(Exit4_7): > > - mov (%rsi), %ecx > > - mov %ecx, (%rdi) > > - mov -3(%rsi, %rdx), %ecx > > - mov %ecx, -3(%rdi, %rdx) > > + .p2align 4,, 8 > > +L(ret_vec_x1): > > + bsf %VRCX, %VRCX > > + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > # ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub %rdx, %r8 > > - sub $1, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > # endif > > ret > > > > - .p2align 4 > > -L(Exit8_15): > > - mov (%rsi), %rcx > > - mov -7(%rsi, 
%rdx), %r9 > > - mov %rcx, (%rdi) > > - mov %r9, -7(%rdi, %rdx) > > + .p2align 4,, 4 > > +L(ret_vec_x2): > > + bsf %VRCX, %VRCX > > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > # ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub %rdx, %r8 > > - sub $1, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > > # endif > > ret > > > > - .p2align 4 > > -L(Exit16_31): > > - VMOVU (%rsi), %XMM2 > > - VMOVU -15(%rsi, %rdx), %XMM3 > > - VMOVU %XMM2, (%rdi) > > - VMOVU %XMM3, -15(%rdi, %rdx) > > + /* ret_vec_x3 reuses return code after the loop. */ > > + .p2align 4,, 6 > > +L(ret_vec_x4): > > + bsf %VRCX, %VRCX > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > # ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > -# endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub %rdx, %r8 > > - sub $1, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > > # endif > > ret > > > > - .p2align 4 > > -L(Exit32_63): > > - VMOVU (%rsi), %YMM2 > > - VMOVU -31(%rsi, %rdx), %YMM3 > > - VMOVU %YMM2, (%rdi) > > - VMOVU %YMM3, -31(%rdi, %rdx) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > + > > + .p2align 4,, 4 > > +L(page_cross): > > +# ifndef USE_AS_STRCAT > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > # endif > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > - sub %rdx, %r8 > > - sub $1, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - jnz L(StrncpyFillTailWithZero) > > + movq %rsi, %rcx > > + andq $(VEC_SIZE * -1), %rcx > > + > > + VPCMPEQ (%rcx), %VZERO, %k0 > > + KMOV %k0, %VRCX > > +# ifdef USE_AS_WCSCPY > > + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG > > + shrl $2, %PAGE_ALIGN_REG > > # endif > > - ret > > + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX > > > > -# ifdef USE_AS_STRNCPY > > +# if USE_MOVSB_IN_PAGE_CROSS > > + /* Optimizing more aggressively for space as this is very cold > > + code. This saves 2x cache lines. */ > > > > - .p2align 4 > > -L(StrncpyExit1): > > - movzbl (%rsi), %edx > > - mov %dl, (%rdi) > > -# ifdef USE_AS_STPCPY > > - lea 1(%rdi), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, 1(%rdi) > > + /* This adds once to the later result which will get correct > > + copy bounds. NB: this can never zero-out a non-zero RCX as > > + to be in the page cross case rsi cannot be aligned and we > > + already right-shift rcx by the misalignment. 
*/ > > + shl %VRCX > > + jz L(page_cross_continue) > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > + movq %rdi, %rax > > # endif > > - ret > > + bsf %VRCX, %VRCX > > + REP_MOVS > > > > - .p2align 4 > > -L(StrncpyExit2): > > - movzwl (%rsi), %edx > > - mov %dx, (%rdi) > > # ifdef USE_AS_STPCPY > > - lea 2(%rdi), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, 2(%rdi) > > + leaq -CHAR_SIZE(%rdi), %rax > > # endif > > ret > > > > - .p2align 4 > > -L(StrncpyExit3_4): > > - movzwl (%rsi), %ecx > > - movzwl -2(%rsi, %r8), %edx > > - mov %cx, (%rdi) > > - mov %dx, -2(%rdi, %r8) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %r8), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi, %r8) > > -# endif > > - ret > > > > - .p2align 4 > > -L(StrncpyExit5_8): > > - mov (%rsi), %ecx > > - mov -4(%rsi, %r8), %edx > > - mov %ecx, (%rdi) > > - mov %edx, -4(%rdi, %r8) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %r8), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi, %r8) > > -# endif > > - ret > > +# else > > + /* Check if we found zero-char before end of page. */ > > + test %VRCX, %VRCX > > + jz L(page_cross_continue) > > > > - .p2align 4 > > -L(StrncpyExit9_16): > > - mov (%rsi), %rcx > > - mov -8(%rsi, %r8), %rdx > > - mov %rcx, (%rdi) > > - mov %rdx, -8(%rdi, %r8) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %r8), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi, %r8) > > -# endif > > - ret > > + /* Traditional copy case, essentially same as used in non-page- > > + cross case but since we can't reuse VMM(0) we need twice as > > + many loads from rsi. */ > > > > - .p2align 4 > > -L(StrncpyExit17_32): > > - VMOVU (%rsi), %XMM2 > > - VMOVU -16(%rsi, %r8), %XMM3 > > - VMOVU %XMM2, (%rdi) > > - VMOVU %XMM3, -16(%rdi, %r8) > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %r8), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi, %r8) > > +# ifndef USE_AS_STRCAT > > + xorl %edx, %edx > > # endif > > - ret > > - > > - .p2align 4 > > -L(StrncpyExit33_64): > > - /* 0/32, 31/16 */ > > - VMOVU (%rsi), %YMM2 > > - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 > > - VMOVU %YMM2, (%rdi) > > - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) > > + /* Dependency on rdi must already have been satisfied. 
*/ > > + bsf %VRCX, %VRDX > > # ifdef USE_AS_STPCPY > > - lea (%rdi, %r8), %rax > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# elif !defined USE_AS_STRCAT > > + movq %rdi, %rax > > # endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi, %r8) > > -# endif > > - ret > > > > - .p2align 4 > > -L(StrncpyExit65): > > - /* 0/32, 32/32, 64/1 */ > > - VMOVU (%rsi), %YMM2 > > - VMOVU 32(%rsi), %YMM3 > > - mov 64(%rsi), %cl > > - VMOVU %YMM2, (%rdi) > > - VMOVU %YMM3, 32(%rdi) > > - mov %cl, 64(%rdi) > > -# ifdef USE_AS_STPCPY > > - lea 65(%rdi), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, 65(%rdi) > > +# if VEC_SIZE == 64 > > +# ifdef USE_AS_WCSCPY > > + testb %cl, %cl > > +# else > > + test %ecx, %ecx > > +# endif > > + jz L(page_cross_copy_32_63) > > # endif > > - ret > > - > > -# ifndef USE_AS_STRCAT > > > > - .p2align 4 > > -L(Fill1): > > - mov %dl, (%rdi) > > - ret > > +# ifdef USE_AS_WCSCPY > > + testb $0xf, %cl > > +# else > > + testw %cx, %cx > > +# endif > > + jz L(page_cross_copy_16_31) > > > > - .p2align 4 > > -L(Fill2): > > - mov %dx, (%rdi) > > - ret > > +# ifdef USE_AS_WCSCPY > > + testb $0x3, %cl > > +# else > > + testb %cl, %cl > > +# endif > > + jz L(page_cross_copy_8_15) > > > > - .p2align 4 > > -L(Fill3_4): > > - mov %dx, (%rdi) > > - mov %dx, -2(%rdi, %r8) > > +# ifdef USE_AS_WCSCPY > > + movl (%rsi), %esi > > + movl %esi, (%rdi) > > + movl $0, (%END_REG) > > ret > > +# else > > > > - .p2align 4 > > -L(Fill5_8): > > - mov %edx, (%rdi) > > - mov %edx, -4(%rdi, %r8) > > - ret > > + testb $0x7, %cl > > + jz L(page_cross_copy_4_7) > > > > - .p2align 4 > > -L(Fill9_16): > > - mov %rdx, (%rdi) > > - mov %rdx, -8(%rdi, %r8) > > + test %edx, %edx > > + jz L(page_cross_set_null_term) > > + movzwl (%rsi), %ecx > > + movw %cx, (%rdi) > > +L(page_cross_set_null_term): > > + movb $0, (%END_REG) > > ret > > > > - .p2align 4 > > -L(Fill17_32): > > - VMOVU %XMMZERO, (%rdi) > > - VMOVU %XMMZERO, -16(%rdi, %r8) > > - ret > > > > - .p2align 4 > > -L(CopyVecSizeUnalignedVec2): > > - VMOVU %YMM2, (%rdi, %rcx) > > - > > - .p2align 4 > > -L(CopyVecSizeVecExit): > > - bsf %edx, %edx > > - add $(VEC_SIZE - 1), %r8 > > - add %rcx, %rdi > > -# ifdef USE_AS_STPCPY > > - lea (%rdi, %rdx), %rax > > -# endif > > - sub %rdx, %r8 > > - lea 1(%rdi, %rdx), %rdi > > - > > - .p2align 4 > > -L(StrncpyFillTailWithZero): > > - xor %edx, %edx > > - sub $VEC_SIZE, %r8 > > - jbe L(StrncpyFillExit) > > - > > - VMOVU %YMMZERO, (%rdi) > > - add $VEC_SIZE, %rdi > > - > > - mov %rdi, %rsi > > - and $(VEC_SIZE - 1), %esi > > - sub %rsi, %rdi > > - add %rsi, %r8 > > - sub $(VEC_SIZE * 4), %r8 > > - jb L(StrncpyFillLessFourVecSize) > > - > > -L(StrncpyFillLoopVmovdqa): > > - VMOVA %YMMZERO, (%rdi) > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) > > - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) > > - add $(VEC_SIZE * 4), %rdi > > - sub $(VEC_SIZE * 4), %r8 > > - jae L(StrncpyFillLoopVmovdqa) > > - > > -L(StrncpyFillLessFourVecSize): > > - add $(VEC_SIZE * 2), %r8 > > - jl L(StrncpyFillLessTwoVecSize) > > - VMOVA %YMMZERO, (%rdi) > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > - add $(VEC_SIZE * 2), %rdi > > - sub $VEC_SIZE, %r8 > > - jl L(StrncpyFillExit) > > - VMOVA %YMMZERO, (%rdi) > > - add $VEC_SIZE, %rdi > > - jmp L(Fill) > > - > > - .p2align 4 > > -L(StrncpyFillLessTwoVecSize): > > - add $VEC_SIZE, %r8 > > - jl L(StrncpyFillExit) > > - VMOVA %YMMZERO, (%rdi) > > - add $VEC_SIZE, %rdi > > - jmp L(Fill) > > - > > - .p2align 4 > > -L(StrncpyFillExit): > > - add $VEC_SIZE, %r8 > > -L(Fill): > 
> - cmp $17, %r8d > > - jae L(Fill17_32) > > - cmp $9, %r8d > > - jae L(Fill9_16) > > - cmp $5, %r8d > > - jae L(Fill5_8) > > - cmp $3, %r8d > > - jae L(Fill3_4) > > - cmp $1, %r8d > > - ja L(Fill2) > > - je L(Fill1) > > + .p2align 4,, 4 > > +L(page_cross_copy_4_7): > > + movl (%rsi), %ecx > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi > > + movl %ecx, (%rdi) > > + movl %esi, -(4 - CHAR_SIZE)(%END_REG) > > ret > > - > > -/* end of ifndef USE_AS_STRCAT */ > > # endif > > > > - .p2align 4 > > -L(UnalignedLeaveCase2OrCase3): > > - test %rdx, %rdx > > - jnz L(UnalignedFourVecSizeLeaveCase2) > > -L(UnalignedFourVecSizeLeaveCase3): > > - lea (VEC_SIZE * 4)(%r8), %rcx > > - and $-VEC_SIZE, %rcx > > - add $(VEC_SIZE * 3), %r8 > > - jl L(CopyVecSizeCase3) > > - VMOVU %YMM4, (%rdi) > > - sub $VEC_SIZE, %r8 > > - jb L(CopyVecSizeCase3) > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > - sub $VEC_SIZE, %r8 > > - jb L(CopyVecSizeCase3) > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > - sub $VEC_SIZE, %r8 > > - jb L(CopyVecSizeCase3) > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > -# ifdef USE_AS_STPCPY > > - lea (VEC_SIZE * 4)(%rdi), %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (VEC_SIZE * 4)(%rdi) > > -# endif > > +# if VEC_SIZE == 64 > > + .p2align 4,, 4 > > +L(page_cross_copy_32_63): > > + VMOVU (%rsi), %VMM_256(0) > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > + VMOVU %VMM_256(0), (%rdi) > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > ret > > - > > - .p2align 4 > > -L(UnalignedFourVecSizeLeaveCase2): > > - xor %ecx, %ecx > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > - kmovd %k1, %edx > > - add $(VEC_SIZE * 3), %r8 > > - jle L(CopyVecSizeCase2OrCase3) > > - test %edx, %edx > > -# ifndef USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec4) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > > - kmovd %k2, %edx > > - VMOVU %YMM4, (%rdi) > > - add $VEC_SIZE, %rcx > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > - test %edx, %edx > > -# ifndef USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec5) > > -# else > > - jnz L(CopyVecSize) > > # endif > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > - kmovd %k3, %edx > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > - add $VEC_SIZE, %rcx > > - sub $VEC_SIZE, %r8 > > - jbe L(CopyVecSizeCase2OrCase3) > > - test %edx, %edx > > -# ifndef USE_AS_STRCAT > > - jnz L(CopyVecSizeUnalignedVec6) > > -# else > > - jnz L(CopyVecSize) > > -# endif > > - > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > - kmovd %k4, %edx > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > - lea VEC_SIZE(%rdi, %rcx), %rdi > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > - bsf %edx, %edx > > - cmp %r8d, %edx > > - jb L(CopyVecSizeExit) > > -L(StrncpyExit): > > - cmp $65, %r8d > > - je L(StrncpyExit65) > > - cmp $33, %r8d > > - jae L(StrncpyExit33_64) > > - cmp $17, %r8d > > - jae L(StrncpyExit17_32) > > - cmp $9, %r8d > > - jae L(StrncpyExit9_16) > > - cmp $5, %r8d > > - jae L(StrncpyExit5_8) > > - cmp $3, %r8d > > - jae L(StrncpyExit3_4) > > - cmp $1, %r8d > > - ja L(StrncpyExit2) > > - je L(StrncpyExit1) > > -# ifdef USE_AS_STPCPY > > - mov %rdi, %rax > > -# endif > > -# ifdef USE_AS_STRCAT > > - movb $0, (%rdi) > > -# endif > > + .p2align 4,, 4 > > +L(page_cross_copy_16_31): > > + vmovdqu (%rsi), %xmm0 > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > + vmovdqu %xmm0, (%rdi) > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > ret > > > > - .p2align 4 > > -L(ExitZero): > > -# ifndef USE_AS_STRCAT > > - mov %rdi, %rax > > -# endif 
> > + .p2align 4,, 4 > > +L(page_cross_copy_8_15): > > + movq (%rsi), %rcx > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > + movq %rcx, (%rdi) > > + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) > > ret > > - > > -# endif > > - > > -# ifndef USE_AS_STRCAT > > -END (STRCPY) > > -# else > > -END (STRCAT) > > # endif > > +END(STRCPY) > > #endif > > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S > > index 203a19bf21..d648ba5cfe 100644 > > --- a/sysdeps/x86_64/multiarch/strncat-evex.S > > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S > > @@ -1,7 +1,520 @@ > > -#ifndef STRNCAT > > -# define STRNCAT __strncat_evex > > -#endif > > +/* {wcs|str}ncat with 256/512-bit EVEX. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > + /* Use evex-masked stores for small sizes. Turned off at the > > + moment. */ > > +# define USE_EVEX_MASKED_STORE 0 > > + > > +# include <sysdep.h> > > + > > +# ifndef VEC_SIZE > > +# include "x86-evex256-vecs.h" > > +# endif > > + > > +# ifndef STRNCAT > > +# define STRNCAT __strncat_evex > > +# endif > > + > > + > > +# ifdef USE_AS_WCSCPY > > +# define movNULL movl > > +# define VMOVU_MASK vmovdqu32 > > +# define VPMIN vpminud > > +# define VPTESTN vptestnmd > > +# define VPTEST vptestmd > > +# define VPCMPEQ vpcmpeqd > > +# define CHAR_SIZE 4 > > + > > +# define REP_MOVS rep movsd > > + > > +# define VMASK_REG VR10 > > +# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst > > + > > +# define USE_WIDE_CHAR > > +# else > > +# define movNULL movb > > +# define VMOVU_MASK vmovdqu8 > > +# define VPMIN vpminub > > +# define VPTESTN vptestnmb > > +# define VPTEST vptestmb > > +# define VPCMPEQ vpcmpeqb > > +# define CHAR_SIZE 1 > > + > > +# define REP_MOVS rep movsb > > + > > +# define VMASK_REG VRCX > > +# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst > > + > > +# endif > > + > > +# include "strncpy-or-cat-overflow-def.h" > > + > > +# include "reg-macros.h" > > + > > + > > +# define VZERO VMM(7) > > +# define VZERO_128 VMM_128(7) > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > + .section SECTION(.text), "ax", @progbits > > +ENTRY(STRNCAT) > > + movq %rdi, %rax > > + > > + /* NB: It's safe to filter out zero-length strings WITHOUT > > + setting null-term. Destination MUST be a null-terminated > > + string so essentially the work is already done. 
*/ > > +# ifdef USE_AS_WCSCPY > > + leaq -1(%rdx), %rcx > > + shrq $56, %rcx > > + jnz L(zero_len) > > +# else > > + test %rdx, %rdx > > + jle L(zero_len) > > +# endif > > + > > +# include "strcat-strlen-evex.S" > > + > > + movl %esi, %ecx > > + andl $(PAGE_SIZE - 1), %ecx > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > + ja L(page_cross) > > +L(page_cross_continue): > > + VMOVU (%rsi), %VMM(0) > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + > > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > > + <= CHAR_PER_VEC with masked instructions (which have > > + potential for dramatically bad perf if dst splits a page and > > + is not in the TLB). */ > > +# if USE_EVEX_MASKED_STORE > > + KMOV %k0, %VRCX > > + FIND_FIRST_ONE (VRCX, VR8) > > + cmpq %r8, %rdx > > + jbe L(less_1x_vec) > > + > > + test %VRCX, %VRCX > > + jz L(more_1x_vec) > > + > > + blsmsk %VRCX, %VRCX > > + KMOV %VRCX, %k1 > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > + ret > > + > > +L(less_1x_vec): > > + mov $-1, %VRCX > > + bzhi %VRDX, %VRCX, %VRCX > > + KMOV %VRCX, %k1 > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > + > > + ret > > +# else > > + KMOV %k0, %VMASK_REG > > + /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf > > + %VMASK_REG, %VRCX` for wcsncat. */ > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > + cmpq %rcx, %rdx > > + jbe L(less_1x_vec) > > + > > + /* If there were no zero-CHARs (rcx was zero before > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ > > + cmpl $CHAR_PER_VEC, %ecx > > + je L(more_1x_vec) > > + > > + movl %ecx, %edx > > + > > +L(less_1x_vec): > > +# if VEC_SIZE == 64 > > + cmpl $(32 / CHAR_SIZE), %edx > > + jae L(copy_32_63) > > +# endif > > + > > + cmpl $(16 / CHAR_SIZE), %edx > > + jae L(copy_16_31) > > + > > + > > + cmpl $(8 / CHAR_SIZE), %edx > > + jae L(copy_8_15) > > + > > +# ifdef USE_AS_WCSCPY > > + vmovd %VMM_128(0), (%rdi) > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + ret > > +# else > > + > > + cmpl $4, %edx > > + jae L(copy_4_7) > > + > > + movzbl (%rsi), %ecx > > + cmpl $1, %edx > > + jbe L(set_null_term) > > + > > + movzwl 1(%rsi), %esi > > + movw %si, 1(%rdi) > > + > > + .p2align 4,, 1 > > +L(set_null_term): > > + movb %cl, (%rdi) > > + movNULL $0, (%rdi, %rdx) > > + ret > > +# endif > > + > > +# if VEC_SIZE == 64 > > + .p2align 4,, 6 > > +L(copy_32_63): > > + VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > + VMOVU %VMM_256(0), (%rdi) > > + VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE) > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + ret > > +# endif > > + .p2align 4,, 6 > > +L(copy_16_31): > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > + and will save code size. 
*/ > > + vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > + VMOVU %VMM_128(0), (%rdi) > > + vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE) > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + ret > > + > > + .p2align 4,, 2 > > +L(copy_8_15): > > + movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx > > + vmovq %VMM_128(0), (%rdi) > > + movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE) > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + ret > > + > > +# ifndef USE_AS_WCSCPY > > + .p2align 4,, 12 > > +L(copy_4_7): > > + movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx > > + vmovd %VMM_128(0), (%rdi) > > + movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE) > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > + ret > > +# endif > > + > > +# endif > > + .p2align 4,, 4 > > +L(zero_len): > > +# ifdef USE_AS_WCSCPY > > + test %rdx, %rdx > > +# endif > > + jne OVERFLOW_STRCAT > > + ret > > > > -#define USE_AS_STRNCAT > > -#define STRCAT STRNCAT > > -#include "strcat-evex.S" > > + .p2align 4,, 8 > > +L(more_1x_vec): > > + VMOVU %VMM(0), (%rdi) > > + > > + /* We are going to align rsi here so will need to be able to re- > > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > > + > > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > > + subq %rsi, %rdi > > + andq $-(VEC_SIZE), %rsi > > +L(loop_last_4x_vec): > > + addq %rsi, %rdi > > + subq %rsi, %rdx > > +# ifdef USE_AS_WCSCPY > > + shrq $2, %rdx > > +# endif > > + > > + /* Will need this regardless. */ > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > + VPTESTN %VMM(1), %VMM(1), %k0 > > + KMOV %k0, %VMASK_REG > > + > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > + ja L(more_2x_vec) > > + > > +L(last_2x_vec): > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > + cmpl %ecx, %edx > > + jbe L(ret_vec_x1_len) > > + > > + /* If there were no zero-CHARs (rcx was zero before > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > + cmpl $CHAR_PER_VEC, %ecx > > + jne L(ret_vec_x1) > > + > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + addl $-CHAR_PER_VEC, %edx > > + bzhi %VRDX, %VRCX, %VR8 > > + jz L(ret_vec_x2_len) > > +L(ret_vec_x2): > > + bsf %VRCX, %VRDX > > +L(ret_vec_x2_len): > > + VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > + ret > > + > > + .p2align 4,, 4 > > +L(ret_vec_x1_len): > > + movl %edx, %ecx > > +L(ret_vec_x1): > > + VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE) > > + VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > + VZEROUPPER_RETURN > > + > > + > > + .p2align 4,, 8 > > +L(last_4x_vec): > > + addl $-(CHAR_PER_VEC * 4), %edx > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > + VPTESTN %VMM(1), %VMM(1), %k0 > > + KMOV %k0, %VMASK_REG > > + subq $-(VEC_SIZE * 4), %rsi > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpl $(CHAR_PER_VEC * 2), %edx > > + jbe L(last_2x_vec) > > + .p2align 4,, 8 > > +L(more_2x_vec): > > +# ifdef USE_AS_WCSCPY > > + xorl %ecx, %ecx > > +# endif > > + bsf %VMASK_REG, %VRCX > > + jnz L(ret_vec_x1) > > + > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x2) > > + > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > + VPTESTN %VMM(3), %VMM(3), %k0 > > + KMOV %k0, %VMASK_REG > > + > > + cmpq $(CHAR_PER_VEC * 4), %rdx > > + ja L(more_4x_vec) > > + > > + /* Adjust length before going to L(ret_vec_x3_len) or > > + L(ret_vec_x3). */ > > + addl $(CHAR_PER_VEC * -2), %edx > > + > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > + cmpl %ecx, %edx > > + jbe L(ret_vec_x3_len) > > + > > + /* If there were no zero-CHARs (rcx was zero before > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ > > + cmpl $CHAR_PER_VEC, %ecx > > + jne L(ret_vec_x3) > > + > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + VPTESTN %VMM(4), %VMM(4), %k0 > > + KMOV %k0, %VRCX > > + addl $-CHAR_PER_VEC, %edx > > + bzhi %VRDX, %VRCX, %VR8 > > + jz L(ret_vec_x4_len) > > +L(ret_vec_x4): > > + bsf %VRCX, %VRDX > > +L(ret_vec_x4_len): > > + VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE) > > + VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > + ret > > + > > + .p2align 4,, 4 > > +L(ret_vec_x3_len): > > + movl %edx, %ecx > > +L(ret_vec_x3): > > + VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > + VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > + ret > > + > > + .p2align 4,, 8 > > +L(more_4x_vec): > > +# ifdef USE_AS_WCSCPY > > + xorl %ecx, %ecx > > +# endif > > + bsf %VMASK_REG, %VRCX > > + jnz L(ret_vec_x3) > > + > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + VPTESTN %VMM(4), %VMM(4), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x4) > > + > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > + > > + /* Check if we are near the end before aligning. 
*/ > > + cmpq $(CHAR_PER_VEC * 8), %rdx > > + jbe L(last_4x_vec) > > + > > + > > + /* Add rsi to rdx (length) before aligning rsi. NB: Since we > > + filtered out huge lengths this cannot overflow. */ > > +# ifdef USE_AS_WCSCPY > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > +# else > > + addq %rsi, %rdx > > +# endif > > + > > + /* Subtract rsi from rdi before aligning (add back will have > > + correct rdi for aligned rsi). */ > > + subq %rsi, %rdi > > + subq $-(VEC_SIZE * 5), %rsi > > + andq $(VEC_SIZE * -4), %rsi > > + > > + /* Load first half of the loop before entry. */ > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + > > + /* Offset rsi by VEC_SIZE so that we can jump to > > + L(loop_last_4x_vec). */ > > + addq $-(VEC_SIZE), %rsi > > + KORTEST %k2, %k4 > > + jnz L(loop_4x_done) > > + > > + /* Store loop end in r9. */ > > + leaq -(VEC_SIZE * 5)(%rdx), %r9 > > + > > + .p2align 4,, 11 > > +L(loop_4x_vec): > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > + > > + subq $(VEC_SIZE * -4), %rsi > > + cmpq %rsi, %r9 > > + jbe L(loop_last_4x_vec) > > + > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + KORTEST %k2, %k4 > > + jz L(loop_4x_vec) > > + > > +L(loop_4x_done): > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > + /* Restore rdi (dst). */ > > + addq %rsi, %rdi > > + > > + /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so > > + test with bsf. */ > > + bsf %VRCX, %VRCX > > + jnz L(ret_vec_x1) > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi) > > + > > + KMOV %k2, %VRCX > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x2) > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > + > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + bsf %VRCX, %VRCX > > + jnz L(ret_vec_x3) > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > + > > + KMOV %k4, %VRCX > > + bsf %VRCX, %VRCX > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > + ret > > + > > + > > + .p2align 4,, 4 > > +L(page_cross): > > + movq %rsi, %r8 > > + andq $(VEC_SIZE * -1), %r8 > > + VPCMPEQ (%r8), %VZERO, %k0 > > + > > +# ifdef USE_AS_WCSCPY > > + KMOV %k0, %VR9 > > + shrl $2, %ecx > > + andl $(CHAR_PER_VEC - 1), %ecx > > + shrx %VRCX, %VR9, %VRCX > > +# else > > + KMOV %k0, %VRCX > > + shrx %VRSI, %VRCX, %VRCX > > +# endif > > + > > + subl %esi, %r8d > > + andl $(VEC_SIZE - 1), %r8d > > +# ifdef USE_AS_WCSCPY > > + shrl $2, %r8d > > +# endif > > + cmpq %r8, %rdx > > + jbe L(page_cross_small) > > + /* Optimizing more for space as this is very cold code. This > > + saves 2x cache lines. */ > > + > > + /* This adds once to the later result which will get correct > > + copy bounds. 
NB: this can never zero-out a non-zero RCX as > > + to be in the page cross case rsi cannot be aligned and we > > + already right-shift rcx by the misalignment. */ > > + shl %VRCX > > + jz L(page_cross_continue) > > + bsf %VRCX, %VRCX > > + REP_MOVS > > + ret > > + > > +L(page_cross_small): > > + tzcnt %VRCX, %VRCX > > + jz L(page_cross_setz) > > + cmpl %edx, %ecx > > + cmova %edx, %ecx > > + > > +# ifdef USE_AS_WCSCPY > > + rep movsd > > +# else > > + rep movsb > > +# endif > > +L(page_cross_setz): > > + movNULL $0, (%rdi) > > + ret > > +END(STRNCAT) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S > > index 1b3426d511..49eaf4cbd9 100644 > > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S > > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S > > @@ -1,7 +1,990 @@ > > -#ifndef STRNCPY > > -# define STRNCPY __strncpy_evex > > -#endif > > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. > > + Copyright (C) 2022 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (4) > > + > > + /* Use evex-masked stores for small sizes. Turned off at the > > + moment. */ > > +# define USE_EVEX_MASKED_STORE 0 > > + > > + > > +# include <sysdep.h> > > +# ifndef VEC_SIZE > > +# include "x86-evex256-vecs.h" > > +# endif > > + > > + > > +# ifndef STRNCPY > > +# define STRNCPY __strncpy_evex > > +# endif > > + > > +# ifdef USE_AS_WCSCPY > > +# define VMOVU_MASK vmovdqu32 > > +# define VPCMPEQ vpcmpeqd > > +# define VPMIN vpminud > > +# define VPTESTN vptestnmd > > +# define VPTEST vptestmd > > +# define CHAR_SIZE 4 > > + > > +# define REP_MOVS rep movsd > > +# define REP_STOS rep stosl > > + > > +# define USE_WIDE_CHAR > > + > > +# else > > +# define VMOVU_MASK vmovdqu8 > > +# define VPCMPEQ vpcmpeqb > > +# define VPMIN vpminub > > +# define VPTESTN vptestnmb > > +# define VPTEST vptestmb > > +# define CHAR_SIZE 1 > > + > > +# define REP_MOVS rep movsb > > +# define REP_STOS rep stosb > > +# endif > > + > > +# include "strncpy-or-cat-overflow-def.h" > > + > > +# define PAGE_SIZE 4096 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > +# include "reg-macros.h" > > + > > + > > +# define VZERO VMM(7) > > +# define VZERO_256 VMM_256(7) > > +# define VZERO_128 VMM_128(7) > > + > > +# if VEC_SIZE == 64 > > +# define VZERO_HALF VZERO_256 > > +# else > > +# define VZERO_HALF VZERO_128 > > +# endif > > + > > + .section SECTION(.text), "ax", @progbits > > +ENTRY(STRNCPY) > > + /* Filter zero length strings and very long strings. 
Zero
> > +	   length strings just return, very long strings are handled by
> > +	   just running rep stos{b|l} to zero-set the destination (which
> > +	   will almost certainly segfault); if that succeeds then just
> > +	   call OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy).  */
> > +# ifdef USE_AS_WCSCPY
> > +	decq	%rdx
> > +	movq	%rdx, %rax
> > +	/* 56 is end of max supported address space.  */
> > +	shr	$56, %rax
> > +	jnz	L(zero_len)
> > +# else
> > +	decq	%rdx
> > +	/* If the flag needs to become `jb`, replace `dec` with `sub`.
> > +	 */
> > +	jl	L(zero_len)
> > +# endif
> > +
> > +	vpxorq	%VZERO_128, %VZERO_128, %VZERO_128
> > +	movl	%esi, %eax
> > +	andl	$(PAGE_SIZE - 1), %eax
> > +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> > +	ja	L(page_cross)
> > +
> > +L(page_cross_continue):
> > +	VMOVU	(%rsi), %VMM(0)
> > +	VPTESTN	%VMM(0), %VMM(0), %k0
> > +	KMOV	%k0, %VRCX
> > +
> > +	/* If no STPCPY just save end ahead of time.  */
> > +# ifndef USE_AS_STPCPY
> > +	movq	%rdi, %rax
> > +# endif
> > +
> > +
> > +	cmpq	$(CHAR_PER_VEC), %rdx
> > +
> > +	/* If USE_EVEX_MASKED_STORE is enabled then we just handle length
> > +	   <= CHAR_PER_VEC with masked instructions (which have
> > +	   potential for dramatically bad perf if dst splits a page and
> > +	   is not in the TLB).  */
> > +# if USE_EVEX_MASKED_STORE
> > +	/* `jae` because length rdx is now length - 1.  */
> > +	jae	L(more_1x_vec)
> > +
> > +	/* If there were multiple zero-CHAR matches in the first VEC,
> > +	   VRCX will be overset, but that's fine since any oversets were
> > +	   at zero positions anyway.  */
> > +
> > +# ifdef USE_AS_STPCPY
> > +	tzcnt	%VRCX, %VRAX
> > +	cmpl	%eax, %edx
> > +	cmovb	%edx, %eax
> > +# ifdef USE_AS_WCSCPY
> > +	adcl	$0, %eax
> > +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +	adcq	%rdi, %rax
> > +# endif
> > +# endif
> > +	dec	%VRCX
> > +
> > +	/* Zero out all non-zero CHARs after the first zero match.  */
> > +	KMOV	%VRCX, %k1
> > +
> > +	/* Use VZERO as destination so this can be reused for
> > +	   L(zfill_less_vec) (which, if jumped to by subsequent logic,
> > +	   will have zeroed out VZERO).  */
> > +	VMOVU_MASK %VMM(0), %VZERO{%k1}{z}
> > +L(zfill_less_vec):
> > +	/* Get mask for what we need to set.  */
> > +	incl	%edx
> > +	mov	$-1, %VRCX
> > +	bzhi	%VRDX, %VRCX, %VRCX
> > +	KMOV	%VRCX, %k1
> > +	VMOVU_MASK %VZERO, (%rdi){%k1}
> > +	ret
> > +
> > +	.p2align 4,, 4
> > +L(zero_len):
> > +	cmpq	$-1, %rdx
> > +	jne	L(best_effort_strncpy)
> > +	movq	%rdi, %rax
> > +	ret
> > +
> > +	.p2align 4,, 8
> > +L(more_1x_vec):
> > +# else
> > +	/* `jb` because length rdx is now length - 1.  */
> > +	jb	L(less_1x_vec)
> > +# endif
> > +
> > +
> > +	/* This may overset but that's fine because we still need to zero
> > +	   fill.  */
> > +	VMOVU	%VMM(0), (%rdi)
> > +
> > +
> > +	/* Length must be >= CHAR_PER_VEC so a match here means we must
> > +	   zero-fill.  */
> > +	test	%VRCX, %VRCX
> > +	jnz	L(zfill)
> > +
> > +
> > +	/* We are going to align rsi here so we will need to be able to
> > +	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge lengths
> > +	   so rsi + rdx * CHAR_SIZE cannot overflow.  */
> > +	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
> > +	subq	%rsi, %rdi
> > +	andq	$-(VEC_SIZE), %rsi
> > +
> > +L(loop_last_4x_vec):
> > +	addq	%rsi, %rdi
> > +	subq	%rsi, %rdx
> > +# ifdef USE_AS_WCSCPY
> > +	shrq	$2, %rdx
> > +# endif
> > +
> > +	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
> > +	VPTESTN	%VMM(1), %VMM(1), %k0
> > +	KMOV	%k0, %VRCX
> > +
> > +	/* -1 because of the `dec %rdx` earlier.
*/ > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > + ja L(more_2x_vec) > > + > > +L(last_2x_vec): > > + /* This will be need to be computed no matter what. We do it > > + ahead of time for CHAR_PER_VEC == 64 because we can't adjust > > + the value of `tzcnt` with a shift. */ > > +# if CHAR_PER_VEC == 64 > > + tzcntq %rcx, %rcx > > +# endif > > + > > + cmpl $(CHAR_PER_VEC), %edx > > + jb L(ret_vec_x1_len) > > + > > + /* Seperate logic for CHAR_PER_VEC == 64 because we already did > > + `tzcnt` on VRCX. */ > > +# if CHAR_PER_VEC == 64 > > + /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */ > > + cmpb $CHAR_PER_VEC, %cl > > + jnz L(ret_vec_x1_no_bsf) > > +# else > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x1) > > +# endif > > + > > + > > + > > + VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + KMOV %k0, %VRCX > > + > > +# if CHAR_PER_VEC < 64 > > + /* This essentiallys adds CHAR_PER_VEC to computed result. */ > > + shlq $CHAR_PER_VEC, %rcx > > +# else > > + tzcntq %rcx, %rcx > > + addl $CHAR_PER_VEC, %ecx > > +# endif > > + > > + .p2align 4,, 4 > > +L(ret_vec_x1_len): > > + /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has > > + already been done. */ > > +# if CHAR_PER_VEC < 64 > > + tzcntq %rcx, %rcx > > +# endif > > + cmpl %ecx, %edx > > + jbe L(ret_vec_x1_len_no_zfill) > > + /* Fall through (expectation) is copy len < buffer len. */ > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > +L(ret_vec_x1_len_no_zfill_mov): > > + movl %ecx, %edx > > +# ifdef USE_AS_STPCPY > > + /* clear flags. */ > > + xorl %ecx, %ecx > > +# endif > > +L(ret_vec_x1_len_no_zfill): > > + VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + leal (VEC_SIZE)(%rdx), %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > + > > + > > + .p2align 4,, 10 > > +L(ret_vec_x1): > > + bsf %VRCX, %VRCX > > +L(ret_vec_x1_no_bsf): > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > + subl %ecx, %edx > > + cmpl $CHAR_PER_VEC, %edx > > + jb L(ret_vec_x1_len_no_zfill_mov) > > + /* Fall through (expectation) is copy len < buffer len. */ > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > + leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax > > +# endif > > + ret > > + > > + .p2align 4,, 8 > > +L(last_4x_vec): > > + /* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl > > + $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just > > + using `movzbl`. */ > > +# if CHAR_PER_VEC == 64 > > + movzbl %dl, %edx > > +# else > > + andl $(CHAR_PER_VEC * 4 - 1), %edx > > +# endif > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > + VPTESTN %VMM(1), %VMM(1), %k0 > > + KMOV %k0, %VRCX > > + subq $-(VEC_SIZE * 4), %rsi > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpl $(CHAR_PER_VEC * 2 - 1), %edx > > + jbe L(last_2x_vec) > > + .p2align 4,, 8 > > +L(more_2x_vec): > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + test %VRCX, %VRCX > > + /* Must fill at least 2x VEC. */ > > + jnz L(zfill_vec1) > > + > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + /* Must fill at least 1x VEC. 
*/ > > + jnz L(zfill_vec2) > > + > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > + VPTESTN %VMM(3), %VMM(3), %k0 > > + KMOV %k0, %VRCX > > + > > + /* Check if len is more 4x VEC. -1 because rdx is len - 1. */ > > + cmpq $(CHAR_PER_VEC * 4 - 1), %rdx > > + ja L(more_4x_vec) > > + > > + subl $(CHAR_PER_VEC * 3), %edx > > + jb L(ret_vec_x3_len) > > + > > + test %VRCX, %VRCX > > + jnz L(ret_vec_x3) > > + > > + VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + KMOV %k0, %VRCX > > + tzcnt %VRCX, %VRCX > > + cmpl %ecx, %edx > > + jbe L(ret_vec_x4_len_no_zfill) > > + /* Fall through (expectation) is copy len < buffer len. */ > > + VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > + movl %ecx, %edx > > +L(ret_vec_x4_len_no_zfill): > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + leal (VEC_SIZE * 4 + 0)(%rdx), %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > + > > + > > +L(ret_vec_x3_len): > > + addl $(CHAR_PER_VEC * 1), %edx > > + tzcnt %VRCX, %VRCX > > + cmpl %ecx, %edx > > + jbe L(ret_vec_x3_len_no_zfill) > > + /* Fall through (expectation) is copy len < buffer len. */ > > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > +L(ret_vec_x3_len_no_zfill_mov): > > + movl %ecx, %edx > > +# ifdef USE_AS_STPCPY > > + /* clear flags. */ > > + xorl %ecx, %ecx > > +# endif > > + .p2align 4,, 4 > > +L(ret_vec_x3_len_no_zfill): > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > + VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + leal (VEC_SIZE * 3 + 0)(%rdx), %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > + > > + > > + .p2align 4,, 8 > > +L(ret_vec_x3): > > + bsf %VRCX, %VRCX > > + VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) > > + subl %ecx, %edx > > + jl L(ret_vec_x3_len_no_zfill_mov) > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > > +# endif > > + ret > > + > > + .p2align 4,, 8 > > +L(more_4x_vec): > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > + test %VRCX, %VRCX > > + jnz L(zfill_vec3) > > + > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > + VPTESTN %VMM(4), %VMM(4), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(zfill_vec4) > > > > -#define USE_AS_STRNCPY > > -#define STRCPY STRNCPY > > -#include "strcpy-evex.S" > > + /* Recheck length before aligning. */ > > + cmpq $(CHAR_PER_VEC * 8 - 1), %rdx > > + jbe L(last_4x_vec) > > + > > + /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */ > > +# ifdef USE_AS_WCSCPY > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > +# else > > + addq %rsi, %rdx > > +# endif > > + subq %rsi, %rdi > > + subq $-(VEC_SIZE * 5), %rsi > > + andq $(VEC_SIZE * -4), %rsi > > + > > + > > + /* Load first half of the loop before entry. 
*/ > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + > > + > > + /* Offset rsi by VEC_SIZE so that we can jump to > > + L(loop_last_4x_vec). */ > > + addq $-(VEC_SIZE), %rsi > > + KORTEST %k2, %k4 > > + jnz L(loop_4x_done) > > + > > + /* Store loop end in r9. */ > > + leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 > > + > > + .p2align 4,, 11 > > +L(loop_4x_vec): > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > + > > + subq $(VEC_SIZE * -4), %rsi > > + cmpq %rsi, %r9 > > + jbe L(loop_last_4x_vec) > > + > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > + > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > + VPTESTN %VMM(4), %VMM(4), %k2 > > + VPTESTN %VMM(6), %VMM(6), %k4 > > + KORTEST %k2, %k4 > > + jz L(loop_4x_vec) > > + > > +L(loop_4x_done): > > + /* Restore rdx (length). */ > > + subq %rsi, %rdx > > +# ifdef USE_AS_WCSCPY > > + shrq $2, %rdx > > +# endif > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > + /* Restore rdi (dst). */ > > + addq %rsi, %rdi > > + VPTESTN %VMM(0), %VMM(0), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(zfill_vec1) > > + > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > + KMOV %k2, %VRCX > > + test %VRCX, %VRCX > > + jnz L(zfill_vec2) > > + > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > + VPTESTN %VMM(2), %VMM(2), %k0 > > + KMOV %k0, %VRCX > > + test %VRCX, %VRCX > > + jnz L(zfill_vec3) > > + > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) > > + KMOV %k4, %VRCX > > + // Zfill more.... > > + > > + .p2align 4,, 4 > > +L(zfill_vec4): > > + subq $(VEC_SIZE * -2), %rdi > > + addq $(CHAR_PER_VEC * -2), %rdx > > +L(zfill_vec2): > > + subq $(VEC_SIZE * -2), %rdi > > + addq $(CHAR_PER_VEC * -1), %rdx > > +L(zfill): > > + /* VRCX must be non-zero. */ > > + bsf %VRCX, %VRCX > > + > > + /* Adjust length / dst for zfill. */ > > + subq %rcx, %rdx > > +# ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > +L(zfill_from_page_cross): > > + > > + /* From here on out its just memset(rdi, 0, rdx). */ > > + cmpq $CHAR_PER_VEC, %rdx > > + jb L(zfill_less_vec) > > + > > +L(zfill_more_1x_vec): > > + VMOVU %VZERO, (%rdi) > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > + ja L(zfill_more_2x_vec) > > +L(zfill_done0): > > + ret > > + > > + /* Coming from vec1/vec2 we must be able to zfill at least 2x > > + VEC. */ > > + .p2align 4,, 8 > > +L(zfill_vec3): > > + subq $(VEC_SIZE * -2), %rdi > > + addq $(CHAR_PER_VEC * -2), %rdx > > + .p2align 4,, 2 > > +L(zfill_vec1): > > + bsfq %rcx, %rcx > > + /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. 
> > + */ > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > + subq %rcx, %rdx > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > + > > + > > + VMOVU %VZERO, (%rdi) > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > + jb L(zfill_done0) > > +L(zfill_more_2x_vec): > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + VMOVU %VZERO, (VEC_SIZE)(%rdi) > > + subq $(CHAR_PER_VEC * 4 - 1), %rdx > > + jbe L(zfill_done) > > + > > +# ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rdx, CHAR_SIZE), %rdx > > +# else > > + addq %rdi, %rdx > > +# endif > > + > > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) > > + > > + > > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) > > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) > > + > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpq %rdi, %rdx > > + jbe L(zfill_done) > > + > > + /* Align rdi and zfill loop. */ > > + andq $-(VEC_SIZE), %rdi > > + .p2align 4,, 12 > > +L(zfill_loop_4x_vec): > > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) > > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) > > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) > > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) > > + subq $-(VEC_SIZE * 4), %rdi > > + cmpq %rdi, %rdx > > + ja L(zfill_loop_4x_vec) > > +L(zfill_done): > > + ret > > + > > + > > + /* Less 1x VEC case if we are not using evex masked store. */ > > +# if !USE_EVEX_MASKED_STORE > > + .p2align 4,, 8 > > +L(copy_1x): > > + /* Special case for copy 1x. It can be handled quickly and many > > + buffer sizes have convenient alignment. */ > > + VMOVU %VMM(0), (%rdi) > > + /* If no zeros then we are done. */ > > + testl %ecx, %ecx > > + jz L(ret_1x_1x) > > + > > + /* Need to zfill, not we know that length <= CHAR_PER_VEC so we > > + only handle the small case here. */ > > + bsf %VRCX, %VRCX > > +L(zfill_less_vec_no_bsf): > > + /* Adjust length / dst then just zfill less_vec. */ > > + subq %rcx, %rdx > > +# ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > + > > +L(zfill_less_vec): > > + cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx > > + jb L(zfill_less_half) > > + > > + VMOVU %VZERO_HALF, (%rdi) > > + VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + ret > > +# ifdef USE_AS_STPCPY > > +L(ret_1x_1x): > > + leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax > > + ret > > +# endif > > + > > + > > +# if VEC_SIZE == 64 > > + .p2align 4,, 4 > > +L(copy_32_63): > > + /* Overfill to avoid branches. */ > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > + VMOVU %VMM_256(0), (%rdi) > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + > > + /* We are taking advantage of the fact that to be here we must > > + be writing null-term as (%rdi, %rcx) we have a byte of lee- > > + way for overwriting. */ > > + cmpl %ecx, %edx > > + ja L(zfill_less_vec_no_bsf) > > +# ifndef USE_AS_STPCPY > > +L(ret_1x_1x): > > +# else > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + movl %edx, %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > +# endif > > + > > + .p2align 4,, 4 > > +L(copy_16_31): > > + /* Overfill to avoid branches. */ > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > + VMOVU %VMM_128(0), (%rdi) > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + cmpl %ecx, %edx > > + > > + /* Seperate logic depending on VEC_SIZE. 
If VEC_SIZE == 64 then > > + we have a larger copy block for 32-63 so this is just falls > > + through to zfill 16-31. If VEC_SIZE == 32 then we check for > > + full zfill of less 1x VEC. */ > > +# if VEC_SIZE == 64 > > + jbe L(ret_16_31) > > + subl %ecx, %edx > > +# ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > +L(zfill_less_half): > > +L(zfill_less_32): > > + cmpl $(16 / CHAR_SIZE), %edx > > + jb L(zfill_less_16) > > + VMOVU %VZERO_128, (%rdi) > > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > + ret > > +# endif > > +L(ret_16_31): > > +# ifdef USE_AS_STPCPY > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + movl %edx, %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > +# else > > + /* VEC_SIZE == 32 begins. */ > > + ja L(zfill_less_vec_no_bsf) > > +# ifndef USE_AS_STPCPY > > +L(ret_1x_1x): > > +# else > > +# ifdef USE_AS_WCSCPY > > + adcq $0, %rdx > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + movl %edx, %eax > > + adcq %rdi, %rax > > +# endif > > +# endif > > + ret > > +# endif > > + > > + > > + .p2align 4,, 4 > > +L(copy_8_15): > > + /* Overfill to avoid branches. */ > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > + vmovq %VMM_128(0), (%rdi) > > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + cmpl %ecx, %edx > > + jbe L(ret_8_15) > > + subl %ecx, %edx > > +# ifdef USE_AS_WCSCPY > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > +# else > > + addq %rcx, %rdi > > +# endif > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > + .p2align 4,, 8 > > +# if VEC_SIZE == 32 > > +L(zfill_less_half): > > +# endif > > +L(zfill_less_16): > > + xorl %ecx, %ecx > > + cmpl $(8 / CHAR_SIZE), %edx > > + jb L(zfill_less_8) > > + movq %rcx, (%rdi) > > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > +# ifndef USE_AS_STPCPY > > +L(ret_8_15): > > +# endif > > + ret > > + > > + .p2align 4,, 8 > > +L(less_1x_vec): > > + je L(copy_1x) > > + > > + /* We will need `tzcnt` result for all other copy sizes. */ > > + tzcnt %VRCX, %VRCX > > +# if VEC_SIZE == 64 > > + cmpl $(32 / CHAR_SIZE), %edx > > + jae L(copy_32_63) > > +# endif > > + > > + cmpl $(16 / CHAR_SIZE), %edx > > + jae L(copy_16_31) > > + > > + cmpl $(8 / CHAR_SIZE), %edx > > + jae L(copy_8_15) > > +# ifdef USE_AS_WCSCPY > > + testl %ecx, %ecx > > + jz L(zfill_less_8_set_ret) > > + > > + movl (%rsi, %rdx, CHAR_SIZE), %esi > > + vmovd %VMM_128(0), (%rdi) > > + movl %esi, (%rdi, %rdx, CHAR_SIZE) > > +# ifdef USE_AS_STPCPY > > + cmpl %ecx, %edx > > +L(ret_8_15): > > + adcq $0, %rdx > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# endif > > + ret > > +L(zfill_less_8_set_ret): > > + xorl %ecx, %ecx > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > +L(zfill_less_8): > > + movl %ecx, (%rdi) > > + movl %ecx, (%rdi, %rdx, CHAR_SIZE) > > + ret > > +# else > > + cmpl $3, %edx > > + jb L(copy_0_3) > > + /* Overfill to avoid branches. 
*/ > > + movl -3(%rsi, %rdx), %esi > > + vmovd %VMM_128(0), (%rdi) > > + movl %esi, -3(%rdi, %rdx) > > + cmpl %ecx, %edx > > + jbe L(ret_4_7) > > + subq %rcx, %rdx > > + addq %rcx, %rdi > > +# ifdef USE_AS_STPCPY > > + movq %rdi, %rax > > +# endif > > + xorl %ecx, %ecx > > + .p2align 4,, 8 > > +L(zfill_less_8): > > + cmpl $3, %edx > > + jb L(zfill_less_3) > > + movl %ecx, (%rdi) > > + movl %ecx, -3(%rdi, %rdx) > > +# ifdef USE_AS_STPCPY > > + ret > > +# endif > > + > > +L(ret_4_7): > > +# ifdef USE_AS_STPCPY > > +L(ret_8_15): > > + movl %edx, %eax > > + adcq %rdi, %rax > > +# endif > > + ret > > + > > + .p2align 4,, 4 > > +L(zfill_less_3): > > + testl %edx, %edx > > + jz L(zfill_1) > > + movw %cx, (%rdi) > > +L(zfill_1): > > + movb %cl, (%rdi, %rdx) > > + ret > > + > > + .p2align 4,, 8 > > +L(copy_0_3): > > + vmovd %VMM_128(0), %r8d > > + testl %edx, %edx > > + jz L(copy_1) > > + movw %r8w, (%rdi) > > + cmpl %ecx, %edx > > + ja L(zfill_from_1) > > + movzbl (%rsi, %rdx), %r8d > > +# ifdef USE_AS_STPCPY > > + movl %edx, %eax > > + adcq %rdi, %rax > > + movb %r8b, (%rdi, %rdx) > > + ret > > +# endif > > + > > +L(copy_1): > > +# ifdef USE_AS_STPCPY > > + movl %edx, %eax > > + cmpl %ecx, %edx > > + adcq %rdi, %rax > > +# endif > > +# ifdef USE_AS_WCSCPY > > + vmovd %VMM_128(0), (%rdi) > > +# else > > + movb %r8b, (%rdi, %rdx) > > +# endif > > + ret > > +# endif > > + > > + > > +# ifndef USE_AS_WCSCPY > > + .p2align 4,, 8 > > +L(zfill_from_1): > > +# ifdef USE_AS_STPCPY > > + leaq (%rdi, %rcx), %rax > > +# endif > > + movw $0, -1(%rdi, %rdx) > > + ret > > +# endif > > + > > + .p2align 4,, 4 > > +L(zero_len): > > + incq %rdx > > + jne L(best_effort_strncpy) > > + movq %rdi, %rax > > + ret > > +# endif > > + > > + > > + .p2align 4,, 4 > > + .p2align 6,, 8 > > +L(page_cross): > > + movq %rsi, %rax > > + andq $(VEC_SIZE * -1), %rax > > + VPCMPEQ (%rax), %VZERO, %k0 > > + KMOV %k0, %VRCX > > +# ifdef USE_AS_WCSCPY > > + movl %esi, %r8d > > + shrl $2, %r8d > > + andl $(CHAR_PER_VEC - 1), %r8d > > + shrx %VR8, %VRCX, %VRCX > > +# else > > + shrx %VRSI, %VRCX, %VRCX > > +# endif > > + > > + /* Compute amount of bytes we checked. */ > > + subl %esi, %eax > > + andl $(VEC_SIZE - 1), %eax > > +# ifdef USE_AS_WCSCPY > > + shrl $2, %eax > > +# endif > > + > > + /* If rax > rdx then we are finishing the copy at the end of the > > + page. */ > > + cmpq %rax, %rdx > > + jb L(page_cross_small) > > + > > + > > + /* If rcx is non-zero then continue. */ > > + test %VRCX, %VRCX > > + jz L(page_cross_continue) > > + > > + /* We found zero-CHAR so need to copy then zfill (we know we > > + didn't cover all of length here). */ > > + bsf %VRCX, %VRCX > > +L(movsb_and_zfill): > > + incl %ecx > > + subq %rcx, %rdx > > +# ifdef USE_AS_STPCPY > > + leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > +# else > > + movq %rdi, %rax > > +# endif > > + > > + REP_MOVS > > +# ifdef USE_AS_WCSCPY > > + movl $0, (%rdi) > > +# else > > + movb $0, (%rdi) > > +# endif > > + jmp L(zfill_from_page_cross) > > + > > +L(page_cross_small): > > + tzcnt %VRCX, %VRCX > > + cmpl %ecx, %edx > > + jbe L(page_cross_copy_only) > > + > > + /* Do a zfill of the tail before copying. 
*/ > > + movq %rdi, %r9 > > + xorl %eax, %eax > > + > > + movl %ecx, %r8d > > + > > + subl %ecx, %edx > > + leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > + movl %edx, %ecx > > + REP_STOS > > + movq %r9, %rdi > > + movl %r8d, %edx > > +L(page_cross_copy_only): > > + leal 1(%rdx), %ecx > > +# ifdef USE_AS_STPCPY > > +# ifdef USE_AS_WCSCPY > > + adcl $0, %edx > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > +# else > > + movl %edx, %eax > > + adcq %rdi, %rax > > +# endif > > +# else > > + movq %rdi, %rax > > +# endif > > + REP_MOVS > > + ret > > + > > + > > +L(best_effort_strncpy): > > + movq %rdx, %rcx > > + xorl %eax, %eax > > + movq %rdi, %r8 > > + /* The length is >= 2^63. We very much so expect to segfault at > > + rep stos. If that doesn't happen then just strcpy to finish. > > + */ > > + REP_STOS > > + movq %r8, %rdi > > + jmp OVERFLOW_STRCPY > > +END(STRNCPY) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > new file mode 100644 > > index 0000000000..d5ff4cbe50 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > Please add a copyright notice. > > > @@ -0,0 +1,65 @@ > > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ > > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1 > > + > > +#if defined USE_MULTIARCH && IS_IN(libc) > > +# define UNDERSCORES __ > > +# ifdef USE_WITH_SSE2 > > +# define ISA_EXT _sse2 > > +# elif defined USE_WITH_AVX > > +# ifdef USE_WITH_RTM > > +# define ISA_EXT _avx_rtm > > +# else > > +# define ISA_EXT _avx > > +# endif > > +# elif defined USE_WITH_AVX2 > > Do we have a function with both AVX and AVX2 versions? If not, should > keep just 1. > > > +# ifdef USE_WITH_RTM > > +# define ISA_EXT _avx2_rtm > > +# else > > +# define ISA_EXT _avx2 > > +# endif > > + > > +# elif defined USE_WITH_EVEX256 > > +# define ISA_EXT _evex > > +# elif defined USE_WITH_EVEX512 > > +# define ISA_EXT _evex512 > > +# endif > > +#else > > +# define UNDERSCORES > > +# define ISA_EXT > > +#endif > > + > > +#ifdef USE_AS_WCSCPY > > +# define STRCPY_PREFIX wc > > +# define STRCAT_PREFIX wcs > > +# ifdef USE_AS_STPCPY > > +# define STRCPY_POSTFIX pcpy > > +# else > > +# define STRCPY_POSTFIX scpy > > +# endif > > +#else > > +# define STRCPY_PREFIX st > > +# define STRCAT_PREFIX str > > +# ifdef USE_AS_STPCPY > > +# define STRCPY_POSTFIX pcpy > > +# else > > +# define STRCPY_POSTFIX rcpy > > +# endif > > +#endif > > +#define STRCAT_POSTFIX cat > > + > > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \ > > + underscores##prefix##postfix##ext > > + > > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__) > > + > > +#ifndef OVERFLOW_STRCPY > > +# define OVERFLOW_STRCPY \ > > + OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT) > > +#endif > > + > > +#ifndef OVERFLOW_STRCAT > > +# define OVERFLOW_STRCAT \ > > + OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT) > > +#endif > > + > > +#endif > > -- > > 2.34.1 > > > > H.J.
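A note on the strncpy-or-cat-overflow-def.h macros quoted above: the header builds the name of the fallback routine (e.g. __stpcpy_evex) by token-pasting an underscore prefix, a function prefix/postfix pair, and an ISA suffix. The sketch below is a minimal standalone illustration of that expansion, not the glibc header itself: UNDERSCORES and ISA_EXT are hard-coded (the real header derives them from USE_MULTIARCH, IS_IN (libc) and the USE_WITH_* defines, and also covers the wcs* names), and the STR/XSTR helpers plus main exist only to print the result.

#include <stdio.h>

/* Hard-coded here for illustration; see the quoted header for the real
   conditional definitions.  */
#define UNDERSCORES __
#define ISA_EXT _evex

#ifdef USE_AS_STPCPY
# define STRCPY_PREFIX st
# define STRCPY_POSTFIX pcpy
#else
# define STRCPY_PREFIX st
# define STRCPY_POSTFIX rcpy
#endif

/* Same token-pasting scheme as the quoted header.  */
#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
  underscores##prefix##postfix##ext
#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
#define OVERFLOW_STRCPY \
  OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)

/* Two-level stringification so the pasted name can be printed.  */
#define STR(x) #x
#define XSTR(x) STR (x)

int
main (void)
{
  /* Prints "__stpcpy_evex" when compiled with -DUSE_AS_STPCPY and
     "__strcpy_evex" otherwise.  */
  puts (XSTR (OVERFLOW_STRCPY));
  return 0;
}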
On Fri, Nov 4, 2022 at 3:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote: > > > Optimizations are: > > > 1. Use more overlapping stores to avoid branches. > > > 2. Reduce how unrolled the aligning copies are (this is more of a > > > code-size save, its a negative for some sizes in terms of > > > perf). > > > 3. Improve the loop a bit (similiar to what we do in strlen with > > > 2x vpminu + kortest instead of 3x vpminu + kmov + test). > > > 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the > > > number that are taken. > > > > > > Performance Changes: > > > > > > Times are from N = 10 runs of the benchmark suite and are > > > reported as geometric mean of all ratios of > > > New Implementation / Old Implementation. > > > > > > stpcpy-evex -> 0.922 > > > strcat-evex -> 0.985 > > > strcpy-evex -> 0.880 > > > > > > strncpy-evex -> 0.831 > > > stpncpy-evex -> 0.780 > > > > > > strncat-evex -> 0.958 > > > > > > Code Size Changes: > > > function -> Bytes New / Bytes Old -> Ratio > > > > > > strcat-evex -> 819 / 1874 -> 0.437 > > > strcpy-evex -> 700 / 1074 -> 0.652 > > > stpcpy-evex -> 735 / 1094 -> 0.672 > > > > > > strncpy-evex -> 1397 / 2611 -> 0.535 > > > stpncpy-evex -> 1489 / 2691 -> 0.553 > > > > > > strncat-evex -> 1184 / 2832 -> 0.418 > > > > > > Notes: > > > 1. Because of the significant difference between the > > > implementations they are split into three files. > > > > > > strcpy-evex.S -> strcpy, stpcpy, strcat > > > strncpy-evex.S -> strncpy > > > strncat-evex.S > strncat > > > > > > I couldn't find a way to merge them without making the > > > ifdefs incredibly difficult to follow. > > > > > > 2. All implementations can be made evex512 by including > > > "x86-evex512-vecs.h" at the top. > > > > > > 3. All implementations have an optional define: > > > `USE_EVEX_MASKED_STORE` > > > Setting to one uses evex-masked stores for handling short > > > strings. This saves code size and branches. It's disabled > > > for all implementations are the moment as there are some > > > serious drawbacks to masked stores in certain cases, but > > > that may be fixed on future architectures. > > > > > > Full check passes on x86-64 and build succeeds for all ISA levels w/ > > > and w/o multiarch. 
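As an aside on item 1 of the quoted commit message ("use more overlapping stores to avoid branches"): this is the pattern behind blocks such as L(copy_8_15) in the new strcpy-evex.S, which store the first and the last chunk of the range and let the two stores overlap instead of branching on the exact length. A minimal C sketch of the idea, using a hypothetical helper named copy_8_15 (this is not the patch's code):

#include <stdint.h>
#include <string.h>

/* Copy len bytes, 8 <= len <= 16: one 8-byte store for the head and one
   for the tail.  The two stores may overlap, so no branch on the exact
   length is needed.  */
void
copy_8_15 (char *dst, const char *src, size_t len)
{
  uint64_t head, tail;
  memcpy (&head, src, sizeof head);		/* First 8 bytes.  */
  memcpy (&tail, src + len - 8, sizeof tail);	/* Last 8 bytes, may overlap.  */
  memcpy (dst, &head, sizeof head);
  memcpy (dst + len - 8, &tail, sizeof tail);
}

In the assembly the head store reuses the vector already loaded for the null-terminator check, and the tail load appears to be positioned so that its last character is the null terminator, which is why no separate length branch is needed there either.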
> > > --- > > > sysdeps/x86_64/multiarch/stpncpy-evex.S | 5 +- > > > sysdeps/x86_64/multiarch/strcat-evex.S | 291 +--- > > > sysdeps/x86_64/multiarch/strcat-strlen-evex.S | 110 ++ > > > sysdeps/x86_64/multiarch/strcpy-evex.S | 1282 ++++++----------- > > > sysdeps/x86_64/multiarch/strncat-evex.S | 525 ++++++- > > > sysdeps/x86_64/multiarch/strncpy-evex.S | 995 ++++++++++++- > > > .../multiarch/strncpy-or-cat-overflow-def.h | 65 + > > > 7 files changed, 2100 insertions(+), 1173 deletions(-) > > > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > index 99ea76a372..3693491baa 100644 > > > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > @@ -3,6 +3,5 @@ > > > #endif > > > > > > #define USE_AS_STPCPY > > > -#define USE_AS_STRNCPY > > > -#define STRCPY STPNCPY > > > -#include "strcpy-evex.S" > > > +#define STRNCPY STPNCPY > > > +#include "strncpy-evex.S" > > > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S > > > index 0e2df947e9..b4207b7889 100644 > > > --- a/sysdeps/x86_64/multiarch/strcat-evex.S > > > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S > > > @@ -1,286 +1,7 @@ > > > -/* strcat with 256-bit EVEX instructions. > > > - Copyright (C) 2021-2022 Free Software Foundation, Inc. > > > - This file is part of the GNU C Library. > > > - > > > - The GNU C Library is free software; you can redistribute it and/or > > > - modify it under the terms of the GNU Lesser General Public > > > - License as published by the Free Software Foundation; either > > > - version 2.1 of the License, or (at your option) any later version. > > > - > > > - The GNU C Library is distributed in the hope that it will be useful, > > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > - Lesser General Public License for more details. > > > - > > > - You should have received a copy of the GNU Lesser General Public > > > - License along with the GNU C Library; if not, see > > > - <https://www.gnu.org/licenses/>. 
*/ > > > - > > > -#include <isa-level.h> > > > - > > > -#if ISA_SHOULD_BUILD (4) > > > - > > > - > > > -# include <sysdep.h> > > > - > > > -# ifndef STRCAT > > > -# define STRCAT __strcat_evex > > > -# endif > > > - > > > -# define VMOVU vmovdqu64 > > > -# define VMOVA vmovdqa64 > > > - > > > -/* zero register */ > > > -# define XMMZERO xmm16 > > > -# define YMMZERO ymm16 > > > -# define YMM0 ymm17 > > > -# define YMM1 ymm18 > > > - > > > -# define USE_AS_STRCAT > > > - > > > -/* Number of bytes in a vector register */ > > > -# define VEC_SIZE 32 > > > - > > > - .section .text.evex,"ax",@progbits > > > -ENTRY (STRCAT) > > > - mov %rdi, %r9 > > > -# ifdef USE_AS_STRNCAT > > > - mov %rdx, %r8 > > > -# endif > > > - > > > - xor %eax, %eax > > > - mov %edi, %ecx > > > - and $((VEC_SIZE * 4) - 1), %ecx > > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > - cmp $(VEC_SIZE * 3), %ecx > > > - ja L(fourth_vector_boundary) > > > - vpcmpb $0, (%rdi), %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_first_vector) > > > - mov %rdi, %rax > > > - and $-VEC_SIZE, %rax > > > - jmp L(align_vec_size_start) > > > -L(fourth_vector_boundary): > > > - mov %rdi, %rax > > > - and $-VEC_SIZE, %rax > > > - vpcmpb $0, (%rax), %YMMZERO, %k0 > > > - mov $-1, %r10d > > > - sub %rax, %rcx > > > - shl %cl, %r10d > > > - kmovd %k0, %edx > > > - and %r10d, %edx > > > - jnz L(exit) > > > - > > > -L(align_vec_size_start): > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_second_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_third_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fourth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fifth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > - add $(VEC_SIZE * 4), %rax > > > - kmovd %k4, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_second_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_third_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fourth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fifth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > - kmovd %k4, %edx > > > - add $(VEC_SIZE * 4), %rax > > > - test %edx, %edx > > > - jnz L(exit_null_on_second_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_third_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fourth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fifth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > - add $(VEC_SIZE * 4), %rax > > > - kmovd %k4, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_second_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE 
* 2)(%rax), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_third_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fourth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fifth_vector) > > > - > > > - test $((VEC_SIZE * 4) - 1), %rax > > > - jz L(align_four_vec_loop) > > > - > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > - add $(VEC_SIZE * 5), %rax > > > - kmovd %k4, %edx > > > - test %edx, %edx > > > - jnz L(exit) > > > - > > > - test $((VEC_SIZE * 4) - 1), %rax > > > - jz L(align_four_vec_loop) > > > - > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > - add $VEC_SIZE, %rax > > > - kmovd %k0, %edx > > > - test %edx, %edx > > > - jnz L(exit) > > > - > > > - test $((VEC_SIZE * 4) - 1), %rax > > > - jz L(align_four_vec_loop) > > > - > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > - add $VEC_SIZE, %rax > > > - kmovd %k0, %edx > > > - test %edx, %edx > > > - jnz L(exit) > > > - > > > - test $((VEC_SIZE * 4) - 1), %rax > > > - jz L(align_four_vec_loop) > > > - > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 > > > - add $VEC_SIZE, %rax > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit) > > > - > > > - add $VEC_SIZE, %rax > > > - > > > - .p2align 4 > > > -L(align_four_vec_loop): > > > - VMOVA (%rax), %YMM0 > > > - VMOVA (VEC_SIZE * 2)(%rax), %YMM1 > > > - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 > > > - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 > > > - vpminub %YMM0, %YMM1, %YMM0 > > > - /* If K0 != 0, there is a null byte. */ > > > - vpcmpb $0, %YMM0, %YMMZERO, %k0 > > > - add $(VEC_SIZE * 4), %rax > > > - ktestd %k0, %k0 > > > - jz L(align_four_vec_loop) > > > - > > > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 > > > - sub $(VEC_SIZE * 5), %rax > > > - kmovd %k0, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_second_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_third_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - test %edx, %edx > > > - jnz L(exit_null_on_fourth_vector) > > > - > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - sub %rdi, %rax > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - add $(VEC_SIZE * 4), %rax > > > - jmp L(StartStrcpyPart) > > > - > > > - .p2align 4 > > > -L(exit): > > > - sub %rdi, %rax > > > -L(exit_null_on_first_vector): > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - jmp L(StartStrcpyPart) > > > - > > > - .p2align 4 > > > -L(exit_null_on_second_vector): > > > - sub %rdi, %rax > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - add $VEC_SIZE, %rax > > > - jmp L(StartStrcpyPart) > > > - > > > - .p2align 4 > > > -L(exit_null_on_third_vector): > > > - sub %rdi, %rax > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - add $(VEC_SIZE * 2), %rax > > > - jmp L(StartStrcpyPart) > > > - > > > - .p2align 4 > > > -L(exit_null_on_fourth_vector): > > > - sub %rdi, %rax > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - add $(VEC_SIZE * 3), %rax > > > - jmp L(StartStrcpyPart) > > > - > > > - .p2align 4 > > > -L(exit_null_on_fifth_vector): > > > - sub %rdi, %rax > > > - bsf %rdx, %rdx > > > - add %rdx, %rax > > > - add $(VEC_SIZE * 4), %rax > > > - > > > - .p2align 4 > > > 
-L(StartStrcpyPart): > > > - lea (%r9, %rax), %rdi > > > - mov %rsi, %rcx > > > - mov %r9, %rax /* save result */ > > > - > > > -# ifdef USE_AS_STRNCAT > > > - test %r8, %r8 > > > - jz L(ExitZero) > > > -# define USE_AS_STRNCPY > > > -# endif > > > - > > > -# include "strcpy-evex.S" > > > +#ifndef STRCAT > > > +# define STRCAT __strcat_evex > > > #endif > > > + > > > +#define USE_AS_STRCAT > > > +#define STRCPY STRCAT > > > +#include "strcpy-evex.S" > > > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > new file mode 100644 > > > index 0000000000..9530d7b683 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > @@ -0,0 +1,110 @@ > > > +/* strlen used for begining of str{n}cat using EVEX 256/512. > > > + Copyright (C) 2011-2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > + > > > +/* NOTE: This file is meant to be included by strcat-evex or > > > + strncat-evex and does not standalone. Before including %rdi > > > + must be saved in %rax. */ > > > > Since this file isn't standalone, please rename it to .h. > > Can it be .h.S so it plays well it IDE modes? It sounds reasonable. > > > > > + > > > + > > > +/* Simple strlen implementation that ends at > > > + L(strcat_strlen_done). 
*/ > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > + movq %rdi, %r8 > > > + andq $(VEC_SIZE * -1), %r8 > > > + VPCMPEQ (%r8), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > +#ifdef USE_AS_WCSCPY > > > + subl %r8d, %edi > > > + shrl $2, %edi > > > +#endif > > > + shrx %VRDI, %VRCX, %VRCX > > > +#ifdef USE_AS_WCSCPY > > > + movq %rax, %rdi > > > +#endif > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v0) > > > + > > > + > > > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > + leaq (VEC_SIZE)(%r8), %rdi > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v0) > > > + > > > + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v1) > > > + > > > + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v2) > > > + > > > + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v3) > > > + > > > + andq $-(VEC_SIZE * 4), %rdi > > > + .p2align 4,, 8 > > > +L(loop_2x_vec): > > > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) > > > + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) > > > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) > > > + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) > > > + VPTESTN %VMM(1), %VMM(1), %k1 > > > + VPTESTN %VMM(3), %VMM(3), %k3 > > > + subq $(VEC_SIZE * -4), %rdi > > > + KORTEST %k1, %k3 > > > + jz L(loop_2x_vec) > > > + > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v0) > > > + > > > + KMOV %k1, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v1) > > > + > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(bsf_and_done_v2) > > > + > > > + KMOV %k3, %VRCX > > > +L(bsf_and_done_v3): > > > + addq $VEC_SIZE, %rdi > > > +L(bsf_and_done_v2): > > > + bsf %VRCX, %VRCX > > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi > > > + jmp L(strcat_strlen_done) > > > + > > > + .p2align 4,, 4 > > > +L(bsf_and_done_v1): > > > + addq $VEC_SIZE, %rdi > > > +L(bsf_and_done_v0): > > > + bsf %VRCX, %VRCX > > > +#ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +#else > > > + addq %rcx, %rdi > > > +#endif > > > +L(strcat_strlen_done): > > > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S > > > index 82e45ac675..1ba0195ed2 100644 > > > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S > > > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S > > > @@ -1,4 +1,4 @@ > > > -/* strcpy with 256-bit EVEX instructions. > > > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. > > > Copyright (C) 2021-2022 Free Software Foundation, Inc. > > > This file is part of the GNU C Library. > > > > > > @@ -17,990 +17,526 @@ > > > <https://www.gnu.org/licenses/>. */ > > > > > > #include <isa-level.h> > > > - > > > #if ISA_SHOULD_BUILD (4) > > > > > > > > > -# ifndef USE_AS_STRCAT > > > -# include <sysdep.h> > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > + moment. */ > > > +# define USE_EVEX_MASKED_STORE 0 > > > + /* Use movsb in page cross case to save code size. 
*/ > > > +# define USE_MOVSB_IN_PAGE_CROSS 1 > > > > > > -# ifndef STRCPY > > > -# define STRCPY __strcpy_evex > > > -# endif > > > +# include <sysdep.h> > > > > > > +# ifndef VEC_SIZE > > > +# include "x86-evex256-vecs.h" > > > # endif > > > > > > -# define VMOVU vmovdqu64 > > > -# define VMOVA vmovdqa64 > > > - > > > -/* Number of bytes in a vector register */ > > > -# ifndef VEC_SIZE > > > -# define VEC_SIZE 32 > > > +# ifndef STRCPY > > > +# define STRCPY __strcpy_evex > > > # endif > > > > > > -# define XMM2 xmm18 > > > -# define XMM3 xmm19 > > > > > > -# define YMM2 ymm18 > > > -# define YMM3 ymm19 > > > -# define YMM4 ymm20 > > > -# define YMM5 ymm21 > > > -# define YMM6 ymm22 > > > -# define YMM7 ymm23 > > > +# ifdef USE_AS_WCSCPY > > > +# define VMOVU_MASK vmovdqu32 > > > +# define VPMIN vpminud > > > +# define VPTESTN vptestnmd > > > +# define VPTEST vptestmd > > > +# define VPCMPEQ vpcmpeqd > > > +# define CHAR_SIZE 4 > > > > > > -# ifndef USE_AS_STRCAT > > > +# define REP_MOVS rep movsd > > > > > > -/* zero register */ > > > -# define XMMZERO xmm16 > > > -# define YMMZERO ymm16 > > > -# define YMM1 ymm17 > > > - > > > - .section .text.evex,"ax",@progbits > > > -ENTRY (STRCPY) > > > -# ifdef USE_AS_STRNCPY > > > - mov %RDX_LP, %R8_LP > > > - test %R8_LP, %R8_LP > > > - jz L(ExitZero) > > > -# endif > > > - mov %rsi, %rcx > > > -# ifndef USE_AS_STPCPY > > > - mov %rdi, %rax /* save result */ > > > -# endif > > > +# define USE_WIDE_CHAR > > > +# else > > > +# define VMOVU_MASK vmovdqu8 > > > +# define VPMIN vpminub > > > +# define VPTESTN vptestnmb > > > +# define VPTEST vptestmb > > > +# define VPCMPEQ vpcmpeqb > > > +# define CHAR_SIZE 1 > > > > > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > +# define REP_MOVS rep movsb > > > # endif > > > > > > - and $((VEC_SIZE * 4) - 1), %ecx > > > - cmp $(VEC_SIZE * 2), %ecx > > > - jbe L(SourceStringAlignmentLessTwoVecSize) > > > - > > > - and $-VEC_SIZE, %rsi > > > - and $(VEC_SIZE - 1), %ecx > > > - > > > - vpcmpb $0, (%rsi), %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - shr %cl, %rdx > > > +# include "reg-macros.h" > > > > > > -# ifdef USE_AS_STRNCPY > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > - mov $VEC_SIZE, %r10 > > > - sub %rcx, %r10 > > > - cmp %r10, %r8 > > > -# else > > > - mov $(VEC_SIZE + 1), %r10 > > > - sub %rcx, %r10 > > > - cmp %r10, %r8 > > > -# endif > > > - jbe L(CopyVecSizeTailCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > - jnz L(CopyVecSizeTail) > > > - > > > - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > > > > -# ifdef USE_AS_STRNCPY > > > - add $VEC_SIZE, %r10 > > > - cmp %r10, %r8 > > > - jbe L(CopyTwoVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > - jnz L(CopyTwoVecSize) > > > - > > > - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ > > > - VMOVU %YMM2, (%rdi) > > > - > > > -/* If source address alignment != destination address alignment */ > > > - .p2align 4 > > > -L(UnalignVecSizeBoth): > > > - sub %rcx, %rdi > > > -# ifdef USE_AS_STRNCPY > > > - add %rcx, %r8 > > > - sbb %rcx, %rcx > > > - or %rcx, %r8 > > > -# endif > > > - mov $VEC_SIZE, %rcx > > > - VMOVA (%rsi, %rcx), %YMM2 > > > - VMOVU %YMM2, (%rdi, %rcx) > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $(VEC_SIZE * 3), %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec2) > > > +# ifdef USE_AS_STPCPY > > > +# define END_REG rax > > > # else > > > - jnz L(CopyVecSize) > > > +# define END_REG rdi, %rdx, CHAR_SIZE > > > # endif > > > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec3) > > > +# ifdef USE_AS_STRCAT > > > +# define PAGE_ALIGN_REG edx > > > +# define PAGE_ALIGN_REG_64 rdx > > > # else > > > - jnz L(CopyVecSize) > > > +# define PAGE_ALIGN_REG eax > > > +# define PAGE_ALIGN_REG_64 rax > > > # endif > > > > > > - VMOVU %YMM3, (%rdi, %rcx) > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 > > > - vpcmpb $0, %YMM4, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec4) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > +# define VZERO VMM(7) > > > +# define VZERO_128 VMM_128(7) > > > > > > - VMOVU %YMM4, (%rdi, %rcx) > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec2) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec2) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > > > - VMOVU %YMM2, (%rdi, %rcx) > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > - add $VEC_SIZE, %rcx > > > -# ifdef USE_AS_STRNCPY > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > -# endif > > > - test %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec3) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > > > > - VMOVU %YMM3, (%rdi, %rcx) > > > - mov %rsi, %rdx > > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > > - and $-(VEC_SIZE * 4), %rsi > > > - sub %rsi, %rdx > > > - sub %rdx, %rdi > > > -# ifdef USE_AS_STRNCPY > > > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 > > > -# endif > > > -L(UnalignedFourVecSizeLoop): > > > - VMOVA (%rsi), %YMM4 > > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > > - vpminub %YMM5, %YMM4, %YMM2 > > > - vpminub %YMM7, %YMM6, %YMM3 > > > - vpminub %YMM2, %YMM3, %YMM2 > > > - /* If K7 != 0, there is a null byte. 
*/ > > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > > - kmovd %k7, %edx > > > -# ifdef USE_AS_STRNCPY > > > - sub $(VEC_SIZE * 4), %r8 > > > - jbe L(UnalignedLeaveCase2OrCase3) > > > + .section SECTION(.text), "ax", @progbits > > > +ENTRY(STRCPY) > > > +# ifdef USE_AS_STRCAT > > > + movq %rdi, %rax > > > +# include "strcat-strlen-evex.S" > > > # endif > > > - test %edx, %edx > > > - jnz L(UnalignedFourVecSizeLeave) > > > - > > > -L(UnalignedFourVecSizeLoop_start): > > > - add $(VEC_SIZE * 4), %rdi > > > - add $(VEC_SIZE * 4), %rsi > > > - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) > > > - VMOVA (%rsi), %YMM4 > > > - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) > > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > > - vpminub %YMM5, %YMM4, %YMM2 > > > - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) > > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > > - VMOVU %YMM7, -VEC_SIZE(%rdi) > > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > > - vpminub %YMM7, %YMM6, %YMM3 > > > - vpminub %YMM2, %YMM3, %YMM2 > > > - /* If K7 != 0, there is a null byte. */ > > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > > - kmovd %k7, %edx > > > -# ifdef USE_AS_STRNCPY > > > - sub $(VEC_SIZE * 4), %r8 > > > - jbe L(UnalignedLeaveCase2OrCase3) > > > + > > > + movl %esi, %PAGE_ALIGN_REG > > > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG > > > + ja L(page_cross) > > > +L(page_cross_continue): > > > + VMOVU (%rsi), %VMM(0) > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > + movq %rdi, %rax > > > # endif > > > - test %edx, %edx > > > - jz L(UnalignedFourVecSizeLoop_start) > > > > > > -L(UnalignedFourVecSizeLeave): > > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - test %edx, %edx > > > - jnz L(CopyVecSizeUnaligned_0) > > > > > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > > > - kmovd %k2, %ecx > > > - test %ecx, %ecx > > > - jnz L(CopyVecSizeUnaligned_16) > > > + /* Two short string implementations. One with traditional > > > + branching approach and one with masked instructions (which > > > + have potential for dramatically bad perf if dst splits a > > > + page and is not in the TLB). 
*/ > > > +# if USE_EVEX_MASKED_STORE > > > + VPTEST %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > +# ifdef USE_AS_WCSCPY > > > + subl $((1 << CHAR_PER_VEC)- 1), %VRCX > > > +# else > > > + inc %VRCX > > > +# endif > > > + jz L(more_1x_vec) > > > + KMOV %VRCX, %k1 > > > + KXOR %k0, %k1, %k1 > > > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - test %edx, %edx > > > - jnz L(CopyVecSizeUnaligned_32) > > > - > > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > > - kmovd %k4, %ecx > > > - bsf %ecx, %edx > > > - VMOVU %YMM4, (%rdi) > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > -# ifdef USE_AS_STPCPY > > > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax > > > -# endif > > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > > - add $(VEC_SIZE - 1), %r8 > > > - sub %rdx, %r8 > > > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi > > > - jmp L(StrncpyFillTailWithZero) > > > -# else > > > - add $(VEC_SIZE * 3), %rsi > > > - add $(VEC_SIZE * 3), %rdi > > > - jmp L(CopyVecSizeExit) > > > -# endif > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > > > > -/* If source address alignment == destination address alignment */ > > > +# ifdef USE_AS_STPCPY > > > + bsf %VRCX, %VRCX > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > > +# endif > > > + ret > > > > > > -L(SourceStringAlignmentLessTwoVecSize): > > > - VMOVU (%rsi), %YMM3 > > > - VMOVU VEC_SIZE(%rsi), %YMM2 > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > +# else > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jz L(more_1x_vec) > > > > > > -# ifdef USE_AS_STRNCPY > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > - cmp $VEC_SIZE, %r8 > > > + xorl %edx, %edx > > > + bsf %VRCX, %VRDX > > > +# ifdef USE_AS_STPCPY > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# endif > > > + > > > + /* Use mask bits in rcx to detect which copy we need. If the low > > > + mask is zero then there must be a bit set in the upper half. > > > + I.e if rcx != 0 and ecx == 0, then match must be upper 32 > > > + bits so we use L(copy_32_63). */ > > > +# if VEC_SIZE == 64 > > > +# ifdef USE_AS_WCSCPY > > > + testb %cl, %cl > > > +# else > > > + testl %ecx, %ecx > > > +# endif > > > + jz L(copy_32_63) > > > +# endif > > > + > > > +# ifdef USE_AS_WCSCPY > > > + testb $0xf, %cl > > > # else > > > - cmp $(VEC_SIZE + 1), %r8 > > > + testw %cx, %cx > > > # endif > > > - jbe L(CopyVecSizeTail1Case2OrCase3) > > > -# endif > > > - test %edx, %edx > > > - jnz L(CopyVecSizeTail1) > > > + jz L(copy_16_31) > > > > > > - VMOVU %YMM3, (%rdi) > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > - kmovd %k0, %edx > > > > > > -# ifdef USE_AS_STRNCPY > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > - cmp $(VEC_SIZE * 2), %r8 > > > +# ifdef USE_AS_WCSCPY > > > + testb $0x3, %cl > > > # else > > > - cmp $((VEC_SIZE * 2) + 1), %r8 > > > + testb %cl, %cl > > > # endif > > > - jbe L(CopyTwoVecSize1Case2OrCase3) > > > -# endif > > > - test %edx, %edx > > > - jnz L(CopyTwoVecSize1) > > > - > > > - and $-VEC_SIZE, %rsi > > > - and $(VEC_SIZE - 1), %ecx > > > - jmp L(UnalignVecSizeBoth) > > > + jz L(copy_8_15) > > > > > > -/*------End of main part with loops---------------------*/ > > > > > > -/* Case1 */ > > > +# ifdef USE_AS_WCSCPY > > > + vmovd %VMM_128(0), (%rdi) > > > + /* No need to copy, we know its zero. 
*/ > > > + movl $0, (%END_REG) > > > > > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) > > > - .p2align 4 > > > -L(CopyVecSize): > > > - add %rcx, %rdi > > > -# endif > > > -L(CopyVecSizeTail): > > > - add %rcx, %rsi > > > -L(CopyVecSizeTail1): > > > - bsf %edx, %edx > > > -L(CopyVecSizeExit): > > > - cmp $32, %edx > > > - jae L(Exit32_63) > > > - cmp $16, %edx > > > - jae L(Exit16_31) > > > - cmp $8, %edx > > > - jae L(Exit8_15) > > > - cmp $4, %edx > > > - jae L(Exit4_7) > > > - cmp $3, %edx > > > - je L(Exit3) > > > - cmp $1, %edx > > > - ja L(Exit2) > > > - je L(Exit1) > > > - movb $0, (%rdi) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub $1, %r8 > > > - lea 1(%rdi), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > -# endif > > > ret > > > +# else > > > > > > - .p2align 4 > > > -L(CopyTwoVecSize1): > > > - add $VEC_SIZE, %rsi > > > - add $VEC_SIZE, %rdi > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub $VEC_SIZE, %r8 > > > -# endif > > > - jmp L(CopyVecSizeTail1) > > > - > > > - .p2align 4 > > > -L(CopyTwoVecSize): > > > - bsf %edx, %edx > > > - add %rcx, %rsi > > > - add $VEC_SIZE, %edx > > > - sub %ecx, %edx > > > - jmp L(CopyVecSizeExit) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeUnaligned_0): > > > - bsf %edx, %edx > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > -# endif > > > - VMOVU %YMM4, (%rdi) > > > - add $((VEC_SIZE * 4) - 1), %r8 > > > - sub %rdx, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - jmp L(StrncpyFillTailWithZero) > > > -# else > > > - jmp L(CopyVecSizeExit) > > > -# endif > > > + testb $0x7, %cl > > > + jz L(copy_4_7) > > > > > > - .p2align 4 > > > -L(CopyVecSizeUnaligned_16): > > > - bsf %ecx, %edx > > > - VMOVU %YMM4, (%rdi) > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > -# ifdef USE_AS_STPCPY > > > - lea VEC_SIZE(%rdi, %rdx), %rax > > > -# endif > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > - add $((VEC_SIZE * 3) - 1), %r8 > > > - sub %rdx, %r8 > > > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi > > > - jmp L(StrncpyFillTailWithZero) > > > -# else > > > - add $VEC_SIZE, %rsi > > > - add $VEC_SIZE, %rdi > > > - jmp L(CopyVecSizeExit) > > > -# endif > > > > > > - .p2align 4 > > > -L(CopyVecSizeUnaligned_32): > > > - bsf %edx, %edx > > > - VMOVU %YMM4, (%rdi) > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > -# ifdef USE_AS_STPCPY > > > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax > > > -# endif > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > - add $((VEC_SIZE * 2) - 1), %r8 > > > - sub %rdx, %r8 > > > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi > > > - jmp L(StrncpyFillTailWithZero) > > > -# else > > > - add $(VEC_SIZE * 2), %rsi > > > - add $(VEC_SIZE * 2), %rdi > > > - jmp L(CopyVecSizeExit) > > > -# endif > > > + test %edx, %edx > > > + jz L(set_null_term) > > > > > > -# ifdef USE_AS_STRNCPY > > > -# ifndef USE_AS_STRCAT > > > - .p2align 4 > > > -L(CopyVecSizeUnalignedVec6): > > > - VMOVU %YMM6, (%rdi, %rcx) > > > - jmp L(CopyVecSizeVecExit) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeUnalignedVec5): > > > - VMOVU %YMM5, (%rdi, %rcx) > > > - jmp L(CopyVecSizeVecExit) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeUnalignedVec4): > > > - VMOVU %YMM4, (%rdi, %rcx) > > > - jmp L(CopyVecSizeVecExit) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeUnalignedVec3): > > > - VMOVU %YMM3, (%rdi, %rcx) 
> > > - jmp L(CopyVecSizeVecExit) > > > + /* NB: make this `vmovw` if support for AVX512-FP16 is added. > > > + */ > > > + vmovd %VMM_128(0), %esi > > > + movw %si, (%rdi) > > > + > > > + .p2align 4,, 1 > > > +L(set_null_term): > > > + /* No need to copy, we know its zero. */ > > > + movb $0, (%END_REG) > > > + ret > > > # endif > > > > > > -/* Case2 */ > > > - > > > - .p2align 4 > > > -L(CopyVecSizeCase2): > > > - add $VEC_SIZE, %r8 > > > - add %rcx, %rdi > > > - add %rcx, %rsi > > > - bsf %edx, %edx > > > - cmp %r8d, %edx > > > - jb L(CopyVecSizeExit) > > > - jmp L(StrncpyExit) > > > - > > > - .p2align 4 > > > -L(CopyTwoVecSizeCase2): > > > - add %rcx, %rsi > > > - bsf %edx, %edx > > > - add $VEC_SIZE, %edx > > > - sub %ecx, %edx > > > - cmp %r8d, %edx > > > - jb L(CopyVecSizeExit) > > > - jmp L(StrncpyExit) > > > - > > > -L(CopyVecSizeTailCase2): > > > - add %rcx, %rsi > > > - bsf %edx, %edx > > > - cmp %r8d, %edx > > > - jb L(CopyVecSizeExit) > > > - jmp L(StrncpyExit) > > > - > > > -L(CopyVecSizeTail1Case2): > > > - bsf %edx, %edx > > > - cmp %r8d, %edx > > > - jb L(CopyVecSizeExit) > > > - jmp L(StrncpyExit) > > > - > > > -/* Case2 or Case3, Case3 */ > > > - > > > - .p2align 4 > > > -L(CopyVecSizeCase2OrCase3): > > > - test %rdx, %rdx > > > - jnz L(CopyVecSizeCase2) > > > -L(CopyVecSizeCase3): > > > - add $VEC_SIZE, %r8 > > > - add %rcx, %rdi > > > - add %rcx, %rsi > > > - jmp L(StrncpyExit) > > > - > > > - .p2align 4 > > > -L(CopyTwoVecSizeCase2OrCase3): > > > - test %rdx, %rdx > > > - jnz L(CopyTwoVecSizeCase2) > > > - add %rcx, %rsi > > > - jmp L(StrncpyExit) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeTailCase2OrCase3): > > > - test %rdx, %rdx > > > - jnz L(CopyVecSizeTailCase2) > > > - add %rcx, %rsi > > > - jmp L(StrncpyExit) > > > - > > > - .p2align 4 > > > -L(CopyTwoVecSize1Case2OrCase3): > > > - add $VEC_SIZE, %rdi > > > - add $VEC_SIZE, %rsi > > > - sub $VEC_SIZE, %r8 > > > -L(CopyVecSizeTail1Case2OrCase3): > > > - test %rdx, %rdx > > > - jnz L(CopyVecSizeTail1Case2) > > > - jmp L(StrncpyExit) > > > +# if VEC_SIZE == 64 > > > + .p2align 4,, 6 > > > +L(copy_32_63): > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > + VMOVU %VMM_256(0), (%rdi) > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > > + ret > > > +# endif > > > + > > > + > > > + .p2align 4,, 6 > > > +L(copy_16_31): > > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > > + and will save code size. 
*/ > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > + VMOVU %VMM_128(0), (%rdi) > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(copy_8_15): > > > +# ifdef USE_AS_WCSCPY > > > + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > > +# else > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx > > > +# endif > > > + vmovq %VMM_128(0), (%rdi) > > > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) > > > + ret > > > # endif > > > > > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ > > > > > > - .p2align 4 > > > -L(Exit1): > > > - movzwl (%rsi), %edx > > > - mov %dx, (%rdi) > > > -# ifdef USE_AS_STPCPY > > > - lea 1(%rdi), %rax > > > +# ifndef USE_AS_WCSCPY > > > + .p2align 4,, 12 > > > +L(copy_4_7): > > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > > + vmovd %VMM_128(0), (%rdi) > > > + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) > > > + ret > > > # endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub $2, %r8 > > > - lea 2(%rdi), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + > > > + > > > + .p2align 4,, 8 > > > +L(more_1x_vec): > > > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > + VMOVU %VMM(0), (%rdi) > > > # endif > > > - ret > > > + subq %rsi, %rdi > > > + andq $-(VEC_SIZE), %rsi > > > + addq %rsi, %rdi > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > > > > - .p2align 4 > > > -L(Exit2): > > > - movzwl (%rsi), %ecx > > > - mov %cx, (%rdi) > > > - movb $0, 2(%rdi) > > > + /* Ideally we store after moves to minimize impact of potential > > > + false-dependencies. */ > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > + VMOVU %VMM(0), (%rax) > > > +# endif > > > + > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x1) > > > + > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > + VMOVU %VMM(1), VEC_SIZE(%rdi) > > > + > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x2) > > > + > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > + > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > + KMOV %k0, %VRDX > > > + test %VRDX, %VRDX > > > + jnz L(ret_vec_x3) > > > + > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x4) > > > + > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > + > > > + > > > + /* Align for 4x loop. */ > > > + subq %rsi, %rdi > > > + > > > + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE > > > + we covered before aligning. */ > > > + subq $-(VEC_SIZE * 5), %rsi > > > + andq $-(VEC_SIZE * 4), %rsi > > > + > > > + > > > + /* Load first half of the loop before entry. 
*/ > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + KORTEST %k2, %k4 > > > + jnz L(loop_4x_done) > > > + > > > + .p2align 4,, 11 > > > +L(loop_4x_vec): > > > + > > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > + > > > + subq $(VEC_SIZE * -4), %rsi > > > + > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > + > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + KORTEST %k2, %k4 > > > + jz L(loop_4x_vec) > > > + > > > +L(loop_4x_done): > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + /* Restore rdi (%rdi). */ > > > + addq %rsi, %rdi > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x0_end) > > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > > > + > > > + KMOV %k2, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x1) > > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > > > + > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x2) > > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > > > + /* Place L(ret_vec_x4) here to save code size. We get a > > > + meaningfuly benefit doing this for stpcpy. 
*/ > > > + KMOV %k4, %VRDX > > > +L(ret_vec_x3): > > > + bsf %VRDX, %VRDX > > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > # ifdef USE_AS_STPCPY > > > - lea 2(%rdi), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub $3, %r8 > > > - lea 3(%rdi), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax > > > # endif > > > +L(return_end): > > > ret > > > > > > - .p2align 4 > > > -L(Exit3): > > > - mov (%rsi), %edx > > > - mov %edx, (%rdi) > > > + .p2align 4,, 6 > > > +L(ret_vec_x0_end): > > > + bsf %VRCX, %VRCX > > > # ifdef USE_AS_STPCPY > > > - lea 3(%rdi), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub $4, %r8 > > > - lea 4(%rdi), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > > # endif > > > + inc %VRCX > > > + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > ret > > > > > > - .p2align 4 > > > -L(Exit4_7): > > > - mov (%rsi), %ecx > > > - mov %ecx, (%rdi) > > > - mov -3(%rsi, %rdx), %ecx > > > - mov %ecx, -3(%rdi, %rdx) > > > + .p2align 4,, 8 > > > +L(ret_vec_x1): > > > + bsf %VRCX, %VRCX > > > + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > # ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub %rdx, %r8 > > > - sub $1, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > > # endif > > > ret > > > > > > - .p2align 4 > > > -L(Exit8_15): > > > - mov (%rsi), %rcx > > > - mov -7(%rsi, %rdx), %r9 > > > - mov %rcx, (%rdi) > > > - mov %r9, -7(%rdi, %rdx) > > > + .p2align 4,, 4 > > > +L(ret_vec_x2): > > > + bsf %VRCX, %VRCX > > > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > # ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub %rdx, %r8 > > > - sub $1, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > > > # endif > > > ret > > > > > > - .p2align 4 > > > -L(Exit16_31): > > > - VMOVU (%rsi), %XMM2 > > > - VMOVU -15(%rsi, %rdx), %XMM3 > > > - VMOVU %XMM2, (%rdi) > > > - VMOVU %XMM3, -15(%rdi, %rdx) > > > + /* ret_vec_x3 reuses return code after the loop. 
*/ > > > + .p2align 4,, 6 > > > +L(ret_vec_x4): > > > + bsf %VRCX, %VRCX > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > # ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > -# endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub %rdx, %r8 > > > - sub $1, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > > > # endif > > > ret > > > > > > - .p2align 4 > > > -L(Exit32_63): > > > - VMOVU (%rsi), %YMM2 > > > - VMOVU -31(%rsi, %rdx), %YMM3 > > > - VMOVU %YMM2, (%rdi) > > > - VMOVU %YMM3, -31(%rdi, %rdx) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > + > > > + .p2align 4,, 4 > > > +L(page_cross): > > > +# ifndef USE_AS_STRCAT > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > # endif > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > - sub %rdx, %r8 > > > - sub $1, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - jnz L(StrncpyFillTailWithZero) > > > + movq %rsi, %rcx > > > + andq $(VEC_SIZE * -1), %rcx > > > + > > > + VPCMPEQ (%rcx), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > +# ifdef USE_AS_WCSCPY > > > + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG > > > + shrl $2, %PAGE_ALIGN_REG > > > # endif > > > - ret > > > + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX > > > > > > -# ifdef USE_AS_STRNCPY > > > +# if USE_MOVSB_IN_PAGE_CROSS > > > + /* Optimizing more aggressively for space as this is very cold > > > + code. This saves 2x cache lines. */ > > > > > > - .p2align 4 > > > -L(StrncpyExit1): > > > - movzbl (%rsi), %edx > > > - mov %dl, (%rdi) > > > -# ifdef USE_AS_STPCPY > > > - lea 1(%rdi), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, 1(%rdi) > > > + /* This adds once to the later result which will get correct > > > + copy bounds. NB: this can never zero-out a non-zero RCX as > > > + to be in the page cross case rsi cannot be aligned and we > > > + already right-shift rcx by the misalignment. */ > > > + shl %VRCX > > > + jz L(page_cross_continue) > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > + movq %rdi, %rax > > > # endif > > > - ret > > > + bsf %VRCX, %VRCX > > > + REP_MOVS > > > > > > - .p2align 4 > > > -L(StrncpyExit2): > > > - movzwl (%rsi), %edx > > > - mov %dx, (%rdi) > > > # ifdef USE_AS_STPCPY > > > - lea 2(%rdi), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, 2(%rdi) > > > + leaq -CHAR_SIZE(%rdi), %rax > > > # endif > > > ret > > > > > > - .p2align 4 > > > -L(StrncpyExit3_4): > > > - movzwl (%rsi), %ecx > > > - movzwl -2(%rsi, %r8), %edx > > > - mov %cx, (%rdi) > > > - mov %dx, -2(%rdi, %r8) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %r8), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi, %r8) > > > -# endif > > > - ret > > > > > > - .p2align 4 > > > -L(StrncpyExit5_8): > > > - mov (%rsi), %ecx > > > - mov -4(%rsi, %r8), %edx > > > - mov %ecx, (%rdi) > > > - mov %edx, -4(%rdi, %r8) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %r8), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi, %r8) > > > -# endif > > > - ret > > > +# else > > > + /* Check if we found zero-char before end of page. 
*/ > > > + test %VRCX, %VRCX > > > + jz L(page_cross_continue) > > > > > > - .p2align 4 > > > -L(StrncpyExit9_16): > > > - mov (%rsi), %rcx > > > - mov -8(%rsi, %r8), %rdx > > > - mov %rcx, (%rdi) > > > - mov %rdx, -8(%rdi, %r8) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %r8), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi, %r8) > > > -# endif > > > - ret > > > + /* Traditional copy case, essentially same as used in non-page- > > > + cross case but since we can't reuse VMM(0) we need twice as > > > + many loads from rsi. */ > > > > > > - .p2align 4 > > > -L(StrncpyExit17_32): > > > - VMOVU (%rsi), %XMM2 > > > - VMOVU -16(%rsi, %r8), %XMM3 > > > - VMOVU %XMM2, (%rdi) > > > - VMOVU %XMM3, -16(%rdi, %r8) > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %r8), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi, %r8) > > > +# ifndef USE_AS_STRCAT > > > + xorl %edx, %edx > > > # endif > > > - ret > > > - > > > - .p2align 4 > > > -L(StrncpyExit33_64): > > > - /* 0/32, 31/16 */ > > > - VMOVU (%rsi), %YMM2 > > > - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 > > > - VMOVU %YMM2, (%rdi) > > > - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) > > > + /* Dependency on rdi must already have been satisfied. */ > > > + bsf %VRCX, %VRDX > > > # ifdef USE_AS_STPCPY > > > - lea (%rdi, %r8), %rax > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# elif !defined USE_AS_STRCAT > > > + movq %rdi, %rax > > > # endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi, %r8) > > > -# endif > > > - ret > > > > > > - .p2align 4 > > > -L(StrncpyExit65): > > > - /* 0/32, 32/32, 64/1 */ > > > - VMOVU (%rsi), %YMM2 > > > - VMOVU 32(%rsi), %YMM3 > > > - mov 64(%rsi), %cl > > > - VMOVU %YMM2, (%rdi) > > > - VMOVU %YMM3, 32(%rdi) > > > - mov %cl, 64(%rdi) > > > -# ifdef USE_AS_STPCPY > > > - lea 65(%rdi), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, 65(%rdi) > > > +# if VEC_SIZE == 64 > > > +# ifdef USE_AS_WCSCPY > > > + testb %cl, %cl > > > +# else > > > + test %ecx, %ecx > > > +# endif > > > + jz L(page_cross_copy_32_63) > > > # endif > > > - ret > > > - > > > -# ifndef USE_AS_STRCAT > > > > > > - .p2align 4 > > > -L(Fill1): > > > - mov %dl, (%rdi) > > > - ret > > > +# ifdef USE_AS_WCSCPY > > > + testb $0xf, %cl > > > +# else > > > + testw %cx, %cx > > > +# endif > > > + jz L(page_cross_copy_16_31) > > > > > > - .p2align 4 > > > -L(Fill2): > > > - mov %dx, (%rdi) > > > - ret > > > +# ifdef USE_AS_WCSCPY > > > + testb $0x3, %cl > > > +# else > > > + testb %cl, %cl > > > +# endif > > > + jz L(page_cross_copy_8_15) > > > > > > - .p2align 4 > > > -L(Fill3_4): > > > - mov %dx, (%rdi) > > > - mov %dx, -2(%rdi, %r8) > > > +# ifdef USE_AS_WCSCPY > > > + movl (%rsi), %esi > > > + movl %esi, (%rdi) > > > + movl $0, (%END_REG) > > > ret > > > +# else > > > > > > - .p2align 4 > > > -L(Fill5_8): > > > - mov %edx, (%rdi) > > > - mov %edx, -4(%rdi, %r8) > > > - ret > > > + testb $0x7, %cl > > > + jz L(page_cross_copy_4_7) > > > > > > - .p2align 4 > > > -L(Fill9_16): > > > - mov %rdx, (%rdi) > > > - mov %rdx, -8(%rdi, %r8) > > > + test %edx, %edx > > > + jz L(page_cross_set_null_term) > > > + movzwl (%rsi), %ecx > > > + movw %cx, (%rdi) > > > +L(page_cross_set_null_term): > > > + movb $0, (%END_REG) > > > ret > > > > > > - .p2align 4 > > > -L(Fill17_32): > > > - VMOVU %XMMZERO, (%rdi) > > > - VMOVU %XMMZERO, -16(%rdi, %r8) > > > - ret > > > > > > - .p2align 4 > > > -L(CopyVecSizeUnalignedVec2): > > > - VMOVU %YMM2, (%rdi, %rcx) > > > - > > > - .p2align 4 > > > -L(CopyVecSizeVecExit): > > > - 
bsf %edx, %edx > > > - add $(VEC_SIZE - 1), %r8 > > > - add %rcx, %rdi > > > -# ifdef USE_AS_STPCPY > > > - lea (%rdi, %rdx), %rax > > > -# endif > > > - sub %rdx, %r8 > > > - lea 1(%rdi, %rdx), %rdi > > > - > > > - .p2align 4 > > > -L(StrncpyFillTailWithZero): > > > - xor %edx, %edx > > > - sub $VEC_SIZE, %r8 > > > - jbe L(StrncpyFillExit) > > > - > > > - VMOVU %YMMZERO, (%rdi) > > > - add $VEC_SIZE, %rdi > > > - > > > - mov %rdi, %rsi > > > - and $(VEC_SIZE - 1), %esi > > > - sub %rsi, %rdi > > > - add %rsi, %r8 > > > - sub $(VEC_SIZE * 4), %r8 > > > - jb L(StrncpyFillLessFourVecSize) > > > - > > > -L(StrncpyFillLoopVmovdqa): > > > - VMOVA %YMMZERO, (%rdi) > > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > > - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) > > > - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) > > > - add $(VEC_SIZE * 4), %rdi > > > - sub $(VEC_SIZE * 4), %r8 > > > - jae L(StrncpyFillLoopVmovdqa) > > > - > > > -L(StrncpyFillLessFourVecSize): > > > - add $(VEC_SIZE * 2), %r8 > > > - jl L(StrncpyFillLessTwoVecSize) > > > - VMOVA %YMMZERO, (%rdi) > > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > > - add $(VEC_SIZE * 2), %rdi > > > - sub $VEC_SIZE, %r8 > > > - jl L(StrncpyFillExit) > > > - VMOVA %YMMZERO, (%rdi) > > > - add $VEC_SIZE, %rdi > > > - jmp L(Fill) > > > - > > > - .p2align 4 > > > -L(StrncpyFillLessTwoVecSize): > > > - add $VEC_SIZE, %r8 > > > - jl L(StrncpyFillExit) > > > - VMOVA %YMMZERO, (%rdi) > > > - add $VEC_SIZE, %rdi > > > - jmp L(Fill) > > > - > > > - .p2align 4 > > > -L(StrncpyFillExit): > > > - add $VEC_SIZE, %r8 > > > -L(Fill): > > > - cmp $17, %r8d > > > - jae L(Fill17_32) > > > - cmp $9, %r8d > > > - jae L(Fill9_16) > > > - cmp $5, %r8d > > > - jae L(Fill5_8) > > > - cmp $3, %r8d > > > - jae L(Fill3_4) > > > - cmp $1, %r8d > > > - ja L(Fill2) > > > - je L(Fill1) > > > + .p2align 4,, 4 > > > +L(page_cross_copy_4_7): > > > + movl (%rsi), %ecx > > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi > > > + movl %ecx, (%rdi) > > > + movl %esi, -(4 - CHAR_SIZE)(%END_REG) > > > ret > > > - > > > -/* end of ifndef USE_AS_STRCAT */ > > > # endif > > > > > > - .p2align 4 > > > -L(UnalignedLeaveCase2OrCase3): > > > - test %rdx, %rdx > > > - jnz L(UnalignedFourVecSizeLeaveCase2) > > > -L(UnalignedFourVecSizeLeaveCase3): > > > - lea (VEC_SIZE * 4)(%r8), %rcx > > > - and $-VEC_SIZE, %rcx > > > - add $(VEC_SIZE * 3), %r8 > > > - jl L(CopyVecSizeCase3) > > > - VMOVU %YMM4, (%rdi) > > > - sub $VEC_SIZE, %r8 > > > - jb L(CopyVecSizeCase3) > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > - sub $VEC_SIZE, %r8 > > > - jb L(CopyVecSizeCase3) > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > - sub $VEC_SIZE, %r8 > > > - jb L(CopyVecSizeCase3) > > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > > -# ifdef USE_AS_STPCPY > > > - lea (VEC_SIZE * 4)(%rdi), %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (VEC_SIZE * 4)(%rdi) > > > -# endif > > > +# if VEC_SIZE == 64 > > > + .p2align 4,, 4 > > > +L(page_cross_copy_32_63): > > > + VMOVU (%rsi), %VMM_256(0) > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > + VMOVU %VMM_256(0), (%rdi) > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > > ret > > > - > > > - .p2align 4 > > > -L(UnalignedFourVecSizeLeaveCase2): > > > - xor %ecx, %ecx > > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > > - kmovd %k1, %edx > > > - add $(VEC_SIZE * 3), %r8 > > > - jle L(CopyVecSizeCase2OrCase3) > > > - test %edx, %edx > > > -# ifndef USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec4) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > - vpcmpb 
$0, %YMM5, %YMMZERO, %k2 > > > - kmovd %k2, %edx > > > - VMOVU %YMM4, (%rdi) > > > - add $VEC_SIZE, %rcx > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > - test %edx, %edx > > > -# ifndef USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec5) > > > -# else > > > - jnz L(CopyVecSize) > > > # endif > > > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > > - kmovd %k3, %edx > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > - add $VEC_SIZE, %rcx > > > - sub $VEC_SIZE, %r8 > > > - jbe L(CopyVecSizeCase2OrCase3) > > > - test %edx, %edx > > > -# ifndef USE_AS_STRCAT > > > - jnz L(CopyVecSizeUnalignedVec6) > > > -# else > > > - jnz L(CopyVecSize) > > > -# endif > > > - > > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > > - kmovd %k4, %edx > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > - lea VEC_SIZE(%rdi, %rcx), %rdi > > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > > - bsf %edx, %edx > > > - cmp %r8d, %edx > > > - jb L(CopyVecSizeExit) > > > -L(StrncpyExit): > > > - cmp $65, %r8d > > > - je L(StrncpyExit65) > > > - cmp $33, %r8d > > > - jae L(StrncpyExit33_64) > > > - cmp $17, %r8d > > > - jae L(StrncpyExit17_32) > > > - cmp $9, %r8d > > > - jae L(StrncpyExit9_16) > > > - cmp $5, %r8d > > > - jae L(StrncpyExit5_8) > > > - cmp $3, %r8d > > > - jae L(StrncpyExit3_4) > > > - cmp $1, %r8d > > > - ja L(StrncpyExit2) > > > - je L(StrncpyExit1) > > > -# ifdef USE_AS_STPCPY > > > - mov %rdi, %rax > > > -# endif > > > -# ifdef USE_AS_STRCAT > > > - movb $0, (%rdi) > > > -# endif > > > + .p2align 4,, 4 > > > +L(page_cross_copy_16_31): > > > + vmovdqu (%rsi), %xmm0 > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > + vmovdqu %xmm0, (%rdi) > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > > ret > > > > > > - .p2align 4 > > > -L(ExitZero): > > > -# ifndef USE_AS_STRCAT > > > - mov %rdi, %rax > > > -# endif > > > + .p2align 4,, 4 > > > +L(page_cross_copy_8_15): > > > + movq (%rsi), %rcx > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > > + movq %rcx, (%rdi) > > > + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) > > > ret > > > - > > > -# endif > > > - > > > -# ifndef USE_AS_STRCAT > > > -END (STRCPY) > > > -# else > > > -END (STRCAT) > > > # endif > > > +END(STRCPY) > > > #endif > > > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S > > > index 203a19bf21..d648ba5cfe 100644 > > > --- a/sysdeps/x86_64/multiarch/strncat-evex.S > > > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S > > > @@ -1,7 +1,520 @@ > > > -#ifndef STRNCAT > > > -# define STRNCAT __strncat_evex > > > -#endif > > > +/* {wcs|str}ncat with 256/512-bit EVEX. > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. 
*/ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > + moment. */ > > > +# define USE_EVEX_MASKED_STORE 0 > > > + > > > +# include <sysdep.h> > > > + > > > +# ifndef VEC_SIZE > > > +# include "x86-evex256-vecs.h" > > > +# endif > > > + > > > +# ifndef STRNCAT > > > +# define STRNCAT __strncat_evex > > > +# endif > > > + > > > + > > > +# ifdef USE_AS_WCSCPY > > > +# define movNULL movl > > > +# define VMOVU_MASK vmovdqu32 > > > +# define VPMIN vpminud > > > +# define VPTESTN vptestnmd > > > +# define VPTEST vptestmd > > > +# define VPCMPEQ vpcmpeqd > > > +# define CHAR_SIZE 4 > > > + > > > +# define REP_MOVS rep movsd > > > + > > > +# define VMASK_REG VR10 > > > +# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst > > > + > > > +# define USE_WIDE_CHAR > > > +# else > > > +# define movNULL movb > > > +# define VMOVU_MASK vmovdqu8 > > > +# define VPMIN vpminub > > > +# define VPTESTN vptestnmb > > > +# define VPTEST vptestmb > > > +# define VPCMPEQ vpcmpeqb > > > +# define CHAR_SIZE 1 > > > + > > > +# define REP_MOVS rep movsb > > > + > > > +# define VMASK_REG VRCX > > > +# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst > > > + > > > +# endif > > > + > > > +# include "strncpy-or-cat-overflow-def.h" > > > + > > > +# include "reg-macros.h" > > > + > > > + > > > +# define VZERO VMM(7) > > > +# define VZERO_128 VMM_128(7) > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > + > > > + .section SECTION(.text), "ax", @progbits > > > +ENTRY(STRNCAT) > > > + movq %rdi, %rax > > > + > > > + /* NB: It's safe to filter out zero-length strings WITHOUT > > > + setting null-term. Destination MUST be a null-terminated > > > + string so essentially the work is already done. */ > > > +# ifdef USE_AS_WCSCPY > > > + leaq -1(%rdx), %rcx > > > + shrq $56, %rcx > > > + jnz L(zero_len) > > > +# else > > > + test %rdx, %rdx > > > + jle L(zero_len) > > > +# endif > > > + > > > +# include "strcat-strlen-evex.S" > > > + > > > + movl %esi, %ecx > > > + andl $(PAGE_SIZE - 1), %ecx > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > > + ja L(page_cross) > > > +L(page_cross_continue): > > > + VMOVU (%rsi), %VMM(0) > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + > > > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > > > + <= CHAR_PER_VEC with masked instructions (which have > > > + potential for dramatically bad perf if dst splits a page and > > > + is not in the TLB). */ > > > +# if USE_EVEX_MASKED_STORE > > > + KMOV %k0, %VRCX > > > + FIND_FIRST_ONE (VRCX, VR8) > > > + cmpq %r8, %rdx > > > + jbe L(less_1x_vec) > > > + > > > + test %VRCX, %VRCX > > > + jz L(more_1x_vec) > > > + > > > + blsmsk %VRCX, %VRCX > > > + KMOV %VRCX, %k1 > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > + ret > > > + > > > +L(less_1x_vec): > > > + mov $-1, %VRCX > > > + bzhi %VRDX, %VRCX, %VRCX > > > + KMOV %VRCX, %k1 > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > + > > > + ret > > > +# else > > > + KMOV %k0, %VMASK_REG > > > + /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf > > > + %VMASK_REG, %VRCX` for wcsncat. */ > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > + cmpq %rcx, %rdx > > > + jbe L(less_1x_vec) > > > + > > > + /* If there were no zero-CHARs (rcx was zero before > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > > + cmpl $CHAR_PER_VEC, %ecx > > > + je L(more_1x_vec) > > > + > > > + movl %ecx, %edx > > > + > > > +L(less_1x_vec): > > > +# if VEC_SIZE == 64 > > > + cmpl $(32 / CHAR_SIZE), %edx > > > + jae L(copy_32_63) > > > +# endif > > > + > > > + cmpl $(16 / CHAR_SIZE), %edx > > > + jae L(copy_16_31) > > > + > > > + > > > + cmpl $(8 / CHAR_SIZE), %edx > > > + jae L(copy_8_15) > > > + > > > +# ifdef USE_AS_WCSCPY > > > + vmovd %VMM_128(0), (%rdi) > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > +# else > > > + > > > + cmpl $4, %edx > > > + jae L(copy_4_7) > > > + > > > + movzbl (%rsi), %ecx > > > + cmpl $1, %edx > > > + jbe L(set_null_term) > > > + > > > + movzwl 1(%rsi), %esi > > > + movw %si, 1(%rdi) > > > + > > > + .p2align 4,, 1 > > > +L(set_null_term): > > > + movb %cl, (%rdi) > > > + movNULL $0, (%rdi, %rdx) > > > + ret > > > +# endif > > > + > > > +# if VEC_SIZE == 64 > > > + .p2align 4,, 6 > > > +L(copy_32_63): > > > + VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > + VMOVU %VMM_256(0), (%rdi) > > > + VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE) > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > +# endif > > > + .p2align 4,, 6 > > > +L(copy_16_31): > > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > > + and will save code size. */ > > > + vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > + VMOVU %VMM_128(0), (%rdi) > > > + vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE) > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > + > > > + .p2align 4,, 2 > > > +L(copy_8_15): > > > + movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx > > > + vmovq %VMM_128(0), (%rdi) > > > + movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE) > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > + > > > +# ifndef USE_AS_WCSCPY > > > + .p2align 4,, 12 > > > +L(copy_4_7): > > > + movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx > > > + vmovd %VMM_128(0), (%rdi) > > > + movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE) > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > +# endif > > > + > > > +# endif > > > + .p2align 4,, 4 > > > +L(zero_len): > > > +# ifdef USE_AS_WCSCPY > > > + test %rdx, %rdx > > > +# endif > > > + jne OVERFLOW_STRCAT > > > + ret > > > > > > -#define USE_AS_STRNCAT > > > -#define STRCAT STRNCAT > > > -#include "strcat-evex.S" > > > + .p2align 4,, 8 > > > +L(more_1x_vec): > > > + VMOVU %VMM(0), (%rdi) > > > + > > > + /* We are going to align rsi here so will need to be able to re- > > > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > > > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > > > + > > > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > > > + subq %rsi, %rdi > > > + andq $-(VEC_SIZE), %rsi > > > +L(loop_last_4x_vec): > > > + addq %rsi, %rdi > > > + subq %rsi, %rdx > > > +# ifdef USE_AS_WCSCPY > > > + shrq $2, %rdx > > > +# endif > > > + > > > + /* Will need this regardless. */ > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > + KMOV %k0, %VMASK_REG > > > + > > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > > + ja L(more_2x_vec) > > > + > > > +L(last_2x_vec): > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > + cmpl %ecx, %edx > > > + jbe L(ret_vec_x1_len) > > > + > > > + /* If there were no zero-CHARs (rcx was zero before > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > > + cmpl $CHAR_PER_VEC, %ecx > > > + jne L(ret_vec_x1) > > > + > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + addl $-CHAR_PER_VEC, %edx > > > + bzhi %VRDX, %VRCX, %VR8 > > > + jz L(ret_vec_x2_len) > > > +L(ret_vec_x2): > > > + bsf %VRCX, %VRDX > > > +L(ret_vec_x2_len): > > > + VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > > + VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > + ret > > > + > > > + .p2align 4,, 4 > > > +L(ret_vec_x1_len): > > > + movl %edx, %ecx > > > +L(ret_vec_x1): > > > + VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE) > > > + VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > + VZEROUPPER_RETURN > > > + > > > + > > > + .p2align 4,, 8 > > > +L(last_4x_vec): > > > + addl $-(CHAR_PER_VEC * 4), %edx > > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > + KMOV %k0, %VMASK_REG > > > + subq $-(VEC_SIZE * 4), %rsi > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpl $(CHAR_PER_VEC * 2), %edx > > > + jbe L(last_2x_vec) > > > + .p2align 4,, 8 > > > +L(more_2x_vec): > > > +# ifdef USE_AS_WCSCPY > > > + xorl %ecx, %ecx > > > +# endif > > > + bsf %VMASK_REG, %VRCX > > > + jnz L(ret_vec_x1) > > > + > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x2) > > > + > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > + KMOV %k0, %VMASK_REG > > > + > > > + cmpq $(CHAR_PER_VEC * 4), %rdx > > > + ja L(more_4x_vec) > > > + > > > + /* Adjust length before going to L(ret_vec_x3_len) or > > > + L(ret_vec_x3). */ > > > + addl $(CHAR_PER_VEC * -2), %edx > > > + > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > + cmpl %ecx, %edx > > > + jbe L(ret_vec_x3_len) > > > + > > > + /* If there were no zero-CHARs (rcx was zero before > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > > + cmpl $CHAR_PER_VEC, %ecx > > > + jne L(ret_vec_x3) > > > + > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > + KMOV %k0, %VRCX > > > + addl $-CHAR_PER_VEC, %edx > > > + bzhi %VRDX, %VRCX, %VR8 > > > + jz L(ret_vec_x4_len) > > > +L(ret_vec_x4): > > > + bsf %VRCX, %VRDX > > > +L(ret_vec_x4_len): > > > + VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE) > > > + VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > + ret > > > + > > > + .p2align 4,, 4 > > > +L(ret_vec_x3_len): > > > + movl %edx, %ecx > > > +L(ret_vec_x3): > > > + VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > > + VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(more_4x_vec): > > > +# ifdef USE_AS_WCSCPY > > > + xorl %ecx, %ecx > > > +# endif > > > + bsf %VMASK_REG, %VRCX > > > + jnz L(ret_vec_x3) > > > + > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x4) > > > + > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > + > > > + /* Check if we are near the end before aligning. */ > > > + cmpq $(CHAR_PER_VEC * 8), %rdx > > > + jbe L(last_4x_vec) > > > + > > > + > > > + /* Add rsi to rdx (length) before aligning rsi. NB: Since we > > > + filtered out huge lengths this cannot overflow. */ > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > > +# else > > > + addq %rsi, %rdx > > > +# endif > > > + > > > + /* Subtract rsi from rdi before aligning (add back will have > > > + correct rdi for aligned rsi). */ > > > + subq %rsi, %rdi > > > + subq $-(VEC_SIZE * 5), %rsi > > > + andq $(VEC_SIZE * -4), %rsi > > > + > > > + /* Load first half of the loop before entry. */ > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + > > > + /* Offset rsi by VEC_SIZE so that we can jump to > > > + L(loop_last_4x_vec). */ > > > + addq $-(VEC_SIZE), %rsi > > > + KORTEST %k2, %k4 > > > + jnz L(loop_4x_done) > > > + > > > + /* Store loop end in r9. 
*/ > > > + leaq -(VEC_SIZE * 5)(%rdx), %r9 > > > + > > > + .p2align 4,, 11 > > > +L(loop_4x_vec): > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > > + > > > + subq $(VEC_SIZE * -4), %rsi > > > + cmpq %rsi, %r9 > > > + jbe L(loop_last_4x_vec) > > > + > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + KORTEST %k2, %k4 > > > + jz L(loop_4x_vec) > > > + > > > +L(loop_4x_done): > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + /* Restore rdi (dst). */ > > > + addq %rsi, %rdi > > > + > > > + /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so > > > + test with bsf. */ > > > + bsf %VRCX, %VRCX > > > + jnz L(ret_vec_x1) > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi) > > > + > > > + KMOV %k2, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x2) > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > > + > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + bsf %VRCX, %VRCX > > > + jnz L(ret_vec_x3) > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > > + > > > + KMOV %k4, %VRCX > > > + bsf %VRCX, %VRCX > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > + ret > > > + > > > + > > > + .p2align 4,, 4 > > > +L(page_cross): > > > + movq %rsi, %r8 > > > + andq $(VEC_SIZE * -1), %r8 > > > + VPCMPEQ (%r8), %VZERO, %k0 > > > + > > > +# ifdef USE_AS_WCSCPY > > > + KMOV %k0, %VR9 > > > + shrl $2, %ecx > > > + andl $(CHAR_PER_VEC - 1), %ecx > > > + shrx %VRCX, %VR9, %VRCX > > > +# else > > > + KMOV %k0, %VRCX > > > + shrx %VRSI, %VRCX, %VRCX > > > +# endif > > > + > > > + subl %esi, %r8d > > > + andl $(VEC_SIZE - 1), %r8d > > > +# ifdef USE_AS_WCSCPY > > > + shrl $2, %r8d > > > +# endif > > > + cmpq %r8, %rdx > > > + jbe L(page_cross_small) > > > + /* Optimizing more for space as this is very cold code. This > > > + saves 2x cache lines. */ > > > + > > > + /* This adds once to the later result which will get correct > > > + copy bounds. NB: this can never zero-out a non-zero RCX as > > > + to be in the page cross case rsi cannot be aligned and we > > > + already right-shift rcx by the misalignment. */ > > > + shl %VRCX > > > + jz L(page_cross_continue) > > > + bsf %VRCX, %VRCX > > > + REP_MOVS > > > + ret > > > + > > > +L(page_cross_small): > > > + tzcnt %VRCX, %VRCX > > > + jz L(page_cross_setz) > > > + cmpl %edx, %ecx > > > + cmova %edx, %ecx > > > + > > > +# ifdef USE_AS_WCSCPY > > > + rep movsd > > > +# else > > > + rep movsb > > > +# endif > > > +L(page_cross_setz): > > > + movNULL $0, (%rdi) > > > + ret > > > +END(STRNCAT) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S > > > index 1b3426d511..49eaf4cbd9 100644 > > > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S > > > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S > > > @@ -1,7 +1,990 @@ > > > -#ifndef STRNCPY > > > -# define STRNCPY __strncpy_evex > > > -#endif > > > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. 
> > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > + This file is part of the GNU C Library. > > > + > > > + The GNU C Library is free software; you can redistribute it and/or > > > + modify it under the terms of the GNU Lesser General Public > > > + License as published by the Free Software Foundation; either > > > + version 2.1 of the License, or (at your option) any later version. > > > + > > > + The GNU C Library is distributed in the hope that it will be useful, > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > + Lesser General Public License for more details. > > > + > > > + You should have received a copy of the GNU Lesser General Public > > > + License along with the GNU C Library; if not, see > > > + <https://www.gnu.org/licenses/>. */ > > > + > > > +#include <isa-level.h> > > > + > > > +#if ISA_SHOULD_BUILD (4) > > > + > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > + moment. */ > > > +# define USE_EVEX_MASKED_STORE 0 > > > + > > > + > > > +# include <sysdep.h> > > > +# ifndef VEC_SIZE > > > +# include "x86-evex256-vecs.h" > > > +# endif > > > + > > > + > > > +# ifndef STRNCPY > > > +# define STRNCPY __strncpy_evex > > > +# endif > > > + > > > +# ifdef USE_AS_WCSCPY > > > +# define VMOVU_MASK vmovdqu32 > > > +# define VPCMPEQ vpcmpeqd > > > +# define VPMIN vpminud > > > +# define VPTESTN vptestnmd > > > +# define VPTEST vptestmd > > > +# define CHAR_SIZE 4 > > > + > > > +# define REP_MOVS rep movsd > > > +# define REP_STOS rep stosl > > > + > > > +# define USE_WIDE_CHAR > > > + > > > +# else > > > +# define VMOVU_MASK vmovdqu8 > > > +# define VPCMPEQ vpcmpeqb > > > +# define VPMIN vpminub > > > +# define VPTESTN vptestnmb > > > +# define VPTEST vptestmb > > > +# define CHAR_SIZE 1 > > > + > > > +# define REP_MOVS rep movsb > > > +# define REP_STOS rep stosb > > > +# endif > > > + > > > +# include "strncpy-or-cat-overflow-def.h" > > > + > > > +# define PAGE_SIZE 4096 > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > + > > > +# include "reg-macros.h" > > > + > > > + > > > +# define VZERO VMM(7) > > > +# define VZERO_256 VMM_256(7) > > > +# define VZERO_128 VMM_128(7) > > > + > > > +# if VEC_SIZE == 64 > > > +# define VZERO_HALF VZERO_256 > > > +# else > > > +# define VZERO_HALF VZERO_128 > > > +# endif > > > + > > > + .section SECTION(.text), "ax", @progbits > > > +ENTRY(STRNCPY) > > > + /* Filter zero length strings and very long strings. Zero > > > + length strings just return, very long strings are handled by > > > + just running rep stos{b|l} to zero set (which will almost > > > + certainly segfault), if that succeeds then just calling > > > + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ > > > +# ifdef USE_AS_WCSCPY > > > + decq %rdx > > > + movq %rdx, %rax > > > + /* 56 is end of max supported address space. */ > > > + shr $56, %rax > > > + jnz L(zero_len) > > > +# else > > > + decq %rdx > > > + /* If the flag needs to become `jb` replace `dec` with `sub`. > > > + */ > > > + jl L(zero_len) > > > +# endif > > > + > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > + movl %esi, %eax > > > + andl $(PAGE_SIZE - 1), %eax > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > + ja L(page_cross) > > > + > > > +L(page_cross_continue): > > > + VMOVU (%rsi), %VMM(0) > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + > > > + /* If no STPCPY just save end ahead of time. 
*/ > > > +# ifndef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > + > > > + > > > + cmpq $(CHAR_PER_VEC), %rdx > > > + > > > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > > > + <= CHAR_PER_VEC with masked instructions (which have > > > + potential for dramatically bad perf if dst splits a page and > > > + is not in the TLB). */ > > > +# if USE_EVEX_MASKED_STORE > > > + /* `jae` because length rdx is now length - 1. */ > > > + jae L(more_1x_vec) > > > + > > > + /* If there where multiple zero-CHAR matches in the first VEC, > > > + VRCX will be overset but thats fine since any oversets where > > > + at zero-positions anyways. */ > > > + > > > +# ifdef USE_AS_STPCPY > > > + tzcnt %VRCX, %VRAX > > > + cmpl %eax, %edx > > > + cmovb %edx, %eax > > > +# ifdef USE_AS_WCSCPY > > > + adcl $0, %eax > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > +# else > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + dec %VRCX > > > + > > > + /* Zero out all non-zero CHAR's after the first zero match. */ > > > + KMOV %VRCX, %k1 > > > + > > > + /* Use VZERO as destination so this can be reused for > > > + L(zfill_less_vec) (which if jumped to by subsequent logic > > > + will have zerod out VZERO. */ > > > + VMOVU_MASK %VMM(0), %VZERO{%k1}{z} > > > +L(zfill_less_vec): > > > + /* Get mask for what we need to set. */ > > > + incl %edx > > > + mov $-1, %VRCX > > > + bzhi %VRDX, %VRCX, %VRCX > > > + KMOV %VRCX, %k1 > > > + VMOVU_MASK %VZERO, (%rdi){%k1} > > > + ret > > > + > > > + .p2align 4,, 4 > > > +L(zero_len): > > > + cmpq $-1, %rdx > > > + jne L(best_effort_strncpy) > > > + movq %rdi, %rax > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(more_1x_vec): > > > +# else > > > + /* `jb` because length rdx is now length - 1. */ > > > + jb L(less_1x_vec) > > > +# endif > > > + > > > + > > > + /* This may overset but thats fine because we still need to zero > > > + fill. */ > > > + VMOVU %VMM(0), (%rdi) > > > + > > > + > > > + /* Length must be >= CHAR_PER_VEC so match here means we must > > > + zero-fill. */ > > > + test %VRCX, %VRCX > > > + jnz L(zfill) > > > + > > > + > > > + /* We are going to align rsi here so will need to be able to re- > > > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > > > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > > > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > > > + subq %rsi, %rdi > > > + andq $-(VEC_SIZE), %rsi > > > + > > > +L(loop_last_4x_vec): > > > + addq %rsi, %rdi > > > + subq %rsi, %rdx > > > +# ifdef USE_AS_WCSCPY > > > + shrq $2, %rdx > > > +# endif > > > + > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > + KMOV %k0, %VRCX > > > + > > > + /* -1 because of the `dec %rdx` earlier. */ > > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > > + ja L(more_2x_vec) > > > + > > > +L(last_2x_vec): > > > + /* This will be need to be computed no matter what. We do it > > > + ahead of time for CHAR_PER_VEC == 64 because we can't adjust > > > + the value of `tzcnt` with a shift. */ > > > +# if CHAR_PER_VEC == 64 > > > + tzcntq %rcx, %rcx > > > +# endif > > > + > > > + cmpl $(CHAR_PER_VEC), %edx > > > + jb L(ret_vec_x1_len) > > > + > > > + /* Seperate logic for CHAR_PER_VEC == 64 because we already did > > > + `tzcnt` on VRCX. */ > > > +# if CHAR_PER_VEC == 64 > > > + /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. 
*/ > > > + cmpb $CHAR_PER_VEC, %cl > > > + jnz L(ret_vec_x1_no_bsf) > > > +# else > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x1) > > > +# endif > > > + > > > + > > > + > > > + VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > + KMOV %k0, %VRCX > > > + > > > +# if CHAR_PER_VEC < 64 > > > + /* This essentiallys adds CHAR_PER_VEC to computed result. */ > > > + shlq $CHAR_PER_VEC, %rcx > > > +# else > > > + tzcntq %rcx, %rcx > > > + addl $CHAR_PER_VEC, %ecx > > > +# endif > > > + > > > + .p2align 4,, 4 > > > +L(ret_vec_x1_len): > > > + /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has > > > + already been done. */ > > > +# if CHAR_PER_VEC < 64 > > > + tzcntq %rcx, %rcx > > > +# endif > > > + cmpl %ecx, %edx > > > + jbe L(ret_vec_x1_len_no_zfill) > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > +L(ret_vec_x1_len_no_zfill_mov): > > > + movl %ecx, %edx > > > +# ifdef USE_AS_STPCPY > > > + /* clear flags. */ > > > + xorl %ecx, %ecx > > > +# endif > > > +L(ret_vec_x1_len_no_zfill): > > > + VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + leal (VEC_SIZE)(%rdx), %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > + > > > + > > > + .p2align 4,, 10 > > > +L(ret_vec_x1): > > > + bsf %VRCX, %VRCX > > > +L(ret_vec_x1_no_bsf): > > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > + subl %ecx, %edx > > > + cmpl $CHAR_PER_VEC, %edx > > > + jb L(ret_vec_x1_len_no_zfill_mov) > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > + VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > + leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax > > > +# endif > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(last_4x_vec): > > > + /* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl > > > + $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just > > > + using `movzbl`. */ > > > +# if CHAR_PER_VEC == 64 > > > + movzbl %dl, %edx > > > +# else > > > + andl $(CHAR_PER_VEC * 4 - 1), %edx > > > +# endif > > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > + KMOV %k0, %VRCX > > > + subq $-(VEC_SIZE * 4), %rsi > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpl $(CHAR_PER_VEC * 2 - 1), %edx > > > + jbe L(last_2x_vec) > > > + .p2align 4,, 8 > > > +L(more_2x_vec): > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > + test %VRCX, %VRCX > > > + /* Must fill at least 2x VEC. */ > > > + jnz L(zfill_vec1) > > > + > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + /* Must fill at least 1x VEC. */ > > > + jnz L(zfill_vec2) > > > + > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > + KMOV %k0, %VRCX > > > + > > > + /* Check if len is more 4x VEC. -1 because rdx is len - 1. 
*/ > > > + cmpq $(CHAR_PER_VEC * 4 - 1), %rdx > > > + ja L(more_4x_vec) > > > + > > > + subl $(CHAR_PER_VEC * 3), %edx > > > + jb L(ret_vec_x3_len) > > > + > > > + test %VRCX, %VRCX > > > + jnz L(ret_vec_x3) > > > + > > > + VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + KMOV %k0, %VRCX > > > + tzcnt %VRCX, %VRCX > > > + cmpl %ecx, %edx > > > + jbe L(ret_vec_x4_len_no_zfill) > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > + VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > + movl %ecx, %edx > > > +L(ret_vec_x4_len_no_zfill): > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + leal (VEC_SIZE * 4 + 0)(%rdx), %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > + > > > + > > > +L(ret_vec_x3_len): > > > + addl $(CHAR_PER_VEC * 1), %edx > > > + tzcnt %VRCX, %VRCX > > > + cmpl %ecx, %edx > > > + jbe L(ret_vec_x3_len_no_zfill) > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > +L(ret_vec_x3_len_no_zfill_mov): > > > + movl %ecx, %edx > > > +# ifdef USE_AS_STPCPY > > > + /* clear flags. */ > > > + xorl %ecx, %ecx > > > +# endif > > > + .p2align 4,, 4 > > > +L(ret_vec_x3_len_no_zfill): > > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > + VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + leal (VEC_SIZE * 3 + 0)(%rdx), %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > + > > > + > > > + .p2align 4,, 8 > > > +L(ret_vec_x3): > > > + bsf %VRCX, %VRCX > > > + VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) > > > + subl %ecx, %edx > > > + jl L(ret_vec_x3_len_no_zfill_mov) > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > > > +# endif > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(more_4x_vec): > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > + test %VRCX, %VRCX > > > + jnz L(zfill_vec3) > > > + > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(zfill_vec4) > > > > > > -#define USE_AS_STRNCPY > > > -#define STRCPY STRNCPY > > > -#include "strcpy-evex.S" > > > + /* Recheck length before aligning. */ > > > + cmpq $(CHAR_PER_VEC * 8 - 1), %rdx > > > + jbe L(last_4x_vec) > > > + > > > + /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */ > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > > +# else > > > + addq %rsi, %rdx > > > +# endif > > > + subq %rsi, %rdi > > > + subq $-(VEC_SIZE * 5), %rsi > > > + andq $(VEC_SIZE * -4), %rsi > > > + > > > + > > > + /* Load first half of the loop before entry. 
*/ > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + > > > + > > > + /* Offset rsi by VEC_SIZE so that we can jump to > > > + L(loop_last_4x_vec). */ > > > + addq $-(VEC_SIZE), %rsi > > > + KORTEST %k2, %k4 > > > + jnz L(loop_4x_done) > > > + > > > + /* Store loop end in r9. */ > > > + leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 > > > + > > > + .p2align 4,, 11 > > > +L(loop_4x_vec): > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > > + > > > + subq $(VEC_SIZE * -4), %rsi > > > + cmpq %rsi, %r9 > > > + jbe L(loop_last_4x_vec) > > > + > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > > + > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > + KORTEST %k2, %k4 > > > + jz L(loop_4x_vec) > > > + > > > +L(loop_4x_done): > > > + /* Restore rdx (length). */ > > > + subq %rsi, %rdx > > > +# ifdef USE_AS_WCSCPY > > > + shrq $2, %rdx > > > +# endif > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > + /* Restore rdi (dst). */ > > > + addq %rsi, %rdi > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(zfill_vec1) > > > + > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > > + KMOV %k2, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(zfill_vec2) > > > + > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > + KMOV %k0, %VRCX > > > + test %VRCX, %VRCX > > > + jnz L(zfill_vec3) > > > + > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) > > > + KMOV %k4, %VRCX > > > + // Zfill more.... > > > + > > > + .p2align 4,, 4 > > > +L(zfill_vec4): > > > + subq $(VEC_SIZE * -2), %rdi > > > + addq $(CHAR_PER_VEC * -2), %rdx > > > +L(zfill_vec2): > > > + subq $(VEC_SIZE * -2), %rdi > > > + addq $(CHAR_PER_VEC * -1), %rdx > > > +L(zfill): > > > + /* VRCX must be non-zero. */ > > > + bsf %VRCX, %VRCX > > > + > > > + /* Adjust length / dst for zfill. */ > > > + subq %rcx, %rdx > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > +L(zfill_from_page_cross): > > > + > > > + /* From here on out its just memset(rdi, 0, rdx). */ > > > + cmpq $CHAR_PER_VEC, %rdx > > > + jb L(zfill_less_vec) > > > + > > > +L(zfill_more_1x_vec): > > > + VMOVU %VZERO, (%rdi) > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > > + ja L(zfill_more_2x_vec) > > > +L(zfill_done0): > > > + ret > > > + > > > + /* Coming from vec1/vec2 we must be able to zfill at least 2x > > > + VEC. 
*/ > > > + .p2align 4,, 8 > > > +L(zfill_vec3): > > > + subq $(VEC_SIZE * -2), %rdi > > > + addq $(CHAR_PER_VEC * -2), %rdx > > > + .p2align 4,, 2 > > > +L(zfill_vec1): > > > + bsfq %rcx, %rcx > > > + /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. > > > + */ > > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > > + subq %rcx, %rdx > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > + > > > + > > > + VMOVU %VZERO, (%rdi) > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > > + jb L(zfill_done0) > > > +L(zfill_more_2x_vec): > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > > + VMOVU %VZERO, (VEC_SIZE)(%rdi) > > > + subq $(CHAR_PER_VEC * 4 - 1), %rdx > > > + jbe L(zfill_done) > > > + > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rdx > > > +# else > > > + addq %rdi, %rdx > > > +# endif > > > + > > > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) > > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) > > > + > > > + > > > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) > > > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) > > > + > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpq %rdi, %rdx > > > + jbe L(zfill_done) > > > + > > > + /* Align rdi and zfill loop. */ > > > + andq $-(VEC_SIZE), %rdi > > > + .p2align 4,, 12 > > > +L(zfill_loop_4x_vec): > > > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) > > > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) > > > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) > > > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) > > > + subq $-(VEC_SIZE * 4), %rdi > > > + cmpq %rdi, %rdx > > > + ja L(zfill_loop_4x_vec) > > > +L(zfill_done): > > > + ret > > > + > > > + > > > + /* Less 1x VEC case if we are not using evex masked store. */ > > > +# if !USE_EVEX_MASKED_STORE > > > + .p2align 4,, 8 > > > +L(copy_1x): > > > + /* Special case for copy 1x. It can be handled quickly and many > > > + buffer sizes have convenient alignment. */ > > > + VMOVU %VMM(0), (%rdi) > > > + /* If no zeros then we are done. */ > > > + testl %ecx, %ecx > > > + jz L(ret_1x_1x) > > > + > > > + /* Need to zfill, not we know that length <= CHAR_PER_VEC so we > > > + only handle the small case here. */ > > > + bsf %VRCX, %VRCX > > > +L(zfill_less_vec_no_bsf): > > > + /* Adjust length / dst then just zfill less_vec. */ > > > + subq %rcx, %rdx > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > + > > > +L(zfill_less_vec): > > > + cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx > > > + jb L(zfill_less_half) > > > + > > > + VMOVU %VZERO_HALF, (%rdi) > > > + VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + ret > > > +# ifdef USE_AS_STPCPY > > > +L(ret_1x_1x): > > > + leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax > > > + ret > > > +# endif > > > + > > > + > > > +# if VEC_SIZE == 64 > > > + .p2align 4,, 4 > > > +L(copy_32_63): > > > + /* Overfill to avoid branches. */ > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > + VMOVU %VMM_256(0), (%rdi) > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + > > > + /* We are taking advantage of the fact that to be here we must > > > + be writing null-term as (%rdi, %rcx) we have a byte of lee- > > > + way for overwriting. 
*/ > > > + cmpl %ecx, %edx > > > + ja L(zfill_less_vec_no_bsf) > > > +# ifndef USE_AS_STPCPY > > > +L(ret_1x_1x): > > > +# else > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > +# endif > > > + > > > + .p2align 4,, 4 > > > +L(copy_16_31): > > > + /* Overfill to avoid branches. */ > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > + VMOVU %VMM_128(0), (%rdi) > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + cmpl %ecx, %edx > > > + > > > + /* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then > > > + we have a larger copy block for 32-63 so this is just falls > > > + through to zfill 16-31. If VEC_SIZE == 32 then we check for > > > + full zfill of less 1x VEC. */ > > > +# if VEC_SIZE == 64 > > > + jbe L(ret_16_31) > > > + subl %ecx, %edx > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > +L(zfill_less_half): > > > +L(zfill_less_32): > > > + cmpl $(16 / CHAR_SIZE), %edx > > > + jb L(zfill_less_16) > > > + VMOVU %VZERO_128, (%rdi) > > > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > + ret > > > +# endif > > > +L(ret_16_31): > > > +# ifdef USE_AS_STPCPY > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > +# else > > > + /* VEC_SIZE == 32 begins. */ > > > + ja L(zfill_less_vec_no_bsf) > > > +# ifndef USE_AS_STPCPY > > > +L(ret_1x_1x): > > > +# else > > > +# ifdef USE_AS_WCSCPY > > > + adcq $0, %rdx > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# endif > > > + ret > > > +# endif > > > + > > > + > > > + .p2align 4,, 4 > > > +L(copy_8_15): > > > + /* Overfill to avoid branches. */ > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > > + vmovq %VMM_128(0), (%rdi) > > > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > + cmpl %ecx, %edx > > > + jbe L(ret_8_15) > > > + subl %ecx, %edx > > > +# ifdef USE_AS_WCSCPY > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > +# else > > > + addq %rcx, %rdi > > > +# endif > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > + .p2align 4,, 8 > > > +# if VEC_SIZE == 32 > > > +L(zfill_less_half): > > > +# endif > > > +L(zfill_less_16): > > > + xorl %ecx, %ecx > > > + cmpl $(8 / CHAR_SIZE), %edx > > > + jb L(zfill_less_8) > > > + movq %rcx, (%rdi) > > > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > +# ifndef USE_AS_STPCPY > > > +L(ret_8_15): > > > +# endif > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(less_1x_vec): > > > + je L(copy_1x) > > > + > > > + /* We will need `tzcnt` result for all other copy sizes. 
*/ > > > + tzcnt %VRCX, %VRCX > > > +# if VEC_SIZE == 64 > > > + cmpl $(32 / CHAR_SIZE), %edx > > > + jae L(copy_32_63) > > > +# endif > > > + > > > + cmpl $(16 / CHAR_SIZE), %edx > > > + jae L(copy_16_31) > > > + > > > + cmpl $(8 / CHAR_SIZE), %edx > > > + jae L(copy_8_15) > > > +# ifdef USE_AS_WCSCPY > > > + testl %ecx, %ecx > > > + jz L(zfill_less_8_set_ret) > > > + > > > + movl (%rsi, %rdx, CHAR_SIZE), %esi > > > + vmovd %VMM_128(0), (%rdi) > > > + movl %esi, (%rdi, %rdx, CHAR_SIZE) > > > +# ifdef USE_AS_STPCPY > > > + cmpl %ecx, %edx > > > +L(ret_8_15): > > > + adcq $0, %rdx > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# endif > > > + ret > > > +L(zfill_less_8_set_ret): > > > + xorl %ecx, %ecx > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > +L(zfill_less_8): > > > + movl %ecx, (%rdi) > > > + movl %ecx, (%rdi, %rdx, CHAR_SIZE) > > > + ret > > > +# else > > > + cmpl $3, %edx > > > + jb L(copy_0_3) > > > + /* Overfill to avoid branches. */ > > > + movl -3(%rsi, %rdx), %esi > > > + vmovd %VMM_128(0), (%rdi) > > > + movl %esi, -3(%rdi, %rdx) > > > + cmpl %ecx, %edx > > > + jbe L(ret_4_7) > > > + subq %rcx, %rdx > > > + addq %rcx, %rdi > > > +# ifdef USE_AS_STPCPY > > > + movq %rdi, %rax > > > +# endif > > > + xorl %ecx, %ecx > > > + .p2align 4,, 8 > > > +L(zfill_less_8): > > > + cmpl $3, %edx > > > + jb L(zfill_less_3) > > > + movl %ecx, (%rdi) > > > + movl %ecx, -3(%rdi, %rdx) > > > +# ifdef USE_AS_STPCPY > > > + ret > > > +# endif > > > + > > > +L(ret_4_7): > > > +# ifdef USE_AS_STPCPY > > > +L(ret_8_15): > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > +# endif > > > + ret > > > + > > > + .p2align 4,, 4 > > > +L(zfill_less_3): > > > + testl %edx, %edx > > > + jz L(zfill_1) > > > + movw %cx, (%rdi) > > > +L(zfill_1): > > > + movb %cl, (%rdi, %rdx) > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(copy_0_3): > > > + vmovd %VMM_128(0), %r8d > > > + testl %edx, %edx > > > + jz L(copy_1) > > > + movw %r8w, (%rdi) > > > + cmpl %ecx, %edx > > > + ja L(zfill_from_1) > > > + movzbl (%rsi, %rdx), %r8d > > > +# ifdef USE_AS_STPCPY > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > + movb %r8b, (%rdi, %rdx) > > > + ret > > > +# endif > > > + > > > +L(copy_1): > > > +# ifdef USE_AS_STPCPY > > > + movl %edx, %eax > > > + cmpl %ecx, %edx > > > + adcq %rdi, %rax > > > +# endif > > > +# ifdef USE_AS_WCSCPY > > > + vmovd %VMM_128(0), (%rdi) > > > +# else > > > + movb %r8b, (%rdi, %rdx) > > > +# endif > > > + ret > > > +# endif > > > + > > > + > > > +# ifndef USE_AS_WCSCPY > > > + .p2align 4,, 8 > > > +L(zfill_from_1): > > > +# ifdef USE_AS_STPCPY > > > + leaq (%rdi, %rcx), %rax > > > +# endif > > > + movw $0, -1(%rdi, %rdx) > > > + ret > > > +# endif > > > + > > > + .p2align 4,, 4 > > > +L(zero_len): > > > + incq %rdx > > > + jne L(best_effort_strncpy) > > > + movq %rdi, %rax > > > + ret > > > +# endif > > > + > > > + > > > + .p2align 4,, 4 > > > + .p2align 6,, 8 > > > +L(page_cross): > > > + movq %rsi, %rax > > > + andq $(VEC_SIZE * -1), %rax > > > + VPCMPEQ (%rax), %VZERO, %k0 > > > + KMOV %k0, %VRCX > > > +# ifdef USE_AS_WCSCPY > > > + movl %esi, %r8d > > > + shrl $2, %r8d > > > + andl $(CHAR_PER_VEC - 1), %r8d > > > + shrx %VR8, %VRCX, %VRCX > > > +# else > > > + shrx %VRSI, %VRCX, %VRCX > > > +# endif > > > + > > > + /* Compute amount of bytes we checked. 
*/ > > > + subl %esi, %eax > > > + andl $(VEC_SIZE - 1), %eax > > > +# ifdef USE_AS_WCSCPY > > > + shrl $2, %eax > > > +# endif > > > + > > > + /* If rax > rdx then we are finishing the copy at the end of the > > > + page. */ > > > + cmpq %rax, %rdx > > > + jb L(page_cross_small) > > > + > > > + > > > + /* If rcx is non-zero then continue. */ > > > + test %VRCX, %VRCX > > > + jz L(page_cross_continue) > > > + > > > + /* We found zero-CHAR so need to copy then zfill (we know we > > > + didn't cover all of length here). */ > > > + bsf %VRCX, %VRCX > > > +L(movsb_and_zfill): > > > + incl %ecx > > > + subq %rcx, %rdx > > > +# ifdef USE_AS_STPCPY > > > + leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > > +# else > > > + movq %rdi, %rax > > > +# endif > > > + > > > + REP_MOVS > > > +# ifdef USE_AS_WCSCPY > > > + movl $0, (%rdi) > > > +# else > > > + movb $0, (%rdi) > > > +# endif > > > + jmp L(zfill_from_page_cross) > > > + > > > +L(page_cross_small): > > > + tzcnt %VRCX, %VRCX > > > + cmpl %ecx, %edx > > > + jbe L(page_cross_copy_only) > > > + > > > + /* Do a zfill of the tail before copying. */ > > > + movq %rdi, %r9 > > > + xorl %eax, %eax > > > + > > > + movl %ecx, %r8d > > > + > > > + subl %ecx, %edx > > > + leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > > + movl %edx, %ecx > > > + REP_STOS > > > + movq %r9, %rdi > > > + movl %r8d, %edx > > > +L(page_cross_copy_only): > > > + leal 1(%rdx), %ecx > > > +# ifdef USE_AS_STPCPY > > > +# ifdef USE_AS_WCSCPY > > > + adcl $0, %edx > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > +# else > > > + movl %edx, %eax > > > + adcq %rdi, %rax > > > +# endif > > > +# else > > > + movq %rdi, %rax > > > +# endif > > > + REP_MOVS > > > + ret > > > + > > > + > > > +L(best_effort_strncpy): > > > + movq %rdx, %rcx > > > + xorl %eax, %eax > > > + movq %rdi, %r8 > > > + /* The length is >= 2^63. We very much so expect to segfault at > > > + rep stos. If that doesn't happen then just strcpy to finish. > > > + */ > > > + REP_STOS > > > + movq %r8, %rdi > > > + jmp OVERFLOW_STRCPY > > > +END(STRNCPY) > > > +#endif > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > new file mode 100644 > > > index 0000000000..d5ff4cbe50 > > > --- /dev/null > > > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > Please add a copyright notice. > > > > > @@ -0,0 +1,65 @@ > > > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ > > > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1 > > > + > > > +#if defined USE_MULTIARCH && IS_IN(libc) > > > +# define UNDERSCORES __ > > > +# ifdef USE_WITH_SSE2 > > > +# define ISA_EXT _sse2 > > > +# elif defined USE_WITH_AVX > > > +# ifdef USE_WITH_RTM > > > +# define ISA_EXT _avx_rtm > > > +# else > > > +# define ISA_EXT _avx > > > +# endif > > > +# elif defined USE_WITH_AVX2 > > > > Do we have a function with both AVX and AVX2 versions? If not, should > > keep just 1. 
> > > > > +# ifdef USE_WITH_RTM > > > +# define ISA_EXT _avx2_rtm > > > +# else > > > +# define ISA_EXT _avx2 > > > +# endif > > > + > > > +# elif defined USE_WITH_EVEX256 > > > +# define ISA_EXT _evex > > > +# elif defined USE_WITH_EVEX512 > > > +# define ISA_EXT _evex512 > > > +# endif > > > +#else > > > +# define UNDERSCORES > > > +# define ISA_EXT > > > +#endif > > > + > > > +#ifdef USE_AS_WCSCPY > > > +# define STRCPY_PREFIX wc > > > +# define STRCAT_PREFIX wcs > > > +# ifdef USE_AS_STPCPY > > > +# define STRCPY_POSTFIX pcpy > > > +# else > > > +# define STRCPY_POSTFIX scpy > > > +# endif > > > +#else > > > +# define STRCPY_PREFIX st > > > +# define STRCAT_PREFIX str > > > +# ifdef USE_AS_STPCPY > > > +# define STRCPY_POSTFIX pcpy > > > +# else > > > +# define STRCPY_POSTFIX rcpy > > > +# endif > > > +#endif > > > +#define STRCAT_POSTFIX cat > > > + > > > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \ > > > + underscores##prefix##postfix##ext > > > + > > > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__) > > > + > > > +#ifndef OVERFLOW_STRCPY > > > +# define OVERFLOW_STRCPY \ > > > + OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT) > > > +#endif > > > + > > > +#ifndef OVERFLOW_STRCAT > > > +# define OVERFLOW_STRCAT \ > > > + OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT) > > > +#endif > > > + > > > +#endif > > > -- > > > 2.34.1 > > > > > > > H.J.
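[A minimal, self-contained sketch — not part of the patch — of the token-pasting scheme in the strncpy-or-cat-overflow-def.h hunk quoted above, showing which fallback symbols OVERFLOW_STRCPY and OVERFLOW_STRCAT resolve to for one configuration. The IS_IN stand-in and the particular USE_WITH_EVEX256 / USE_AS_STPCPY defines are chosen here purely for illustration.]

    /* Reproduces the naming machinery from the quoted header so the
       resulting symbol names can be inspected directly.  */
    #include <stdio.h>

    #define USE_MULTIARCH 1
    #define IS_IN(x) 1            /* stand-in for glibc's IS_IN (libc) */
    #define USE_WITH_EVEX256 1    /* pretend we are building the -evex variant */
    #define USE_AS_STPCPY 1       /* so the copy fallback is stpcpy, not strcpy */

    #if defined USE_MULTIARCH && IS_IN(libc)
    # define UNDERSCORES __
    # ifdef USE_WITH_EVEX256
    #  define ISA_EXT _evex
    # endif
    #endif

    #define STRCPY_PREFIX st
    #define STRCAT_PREFIX str
    #ifdef USE_AS_STPCPY
    # define STRCPY_POSTFIX pcpy
    #else
    # define STRCPY_POSTFIX rcpy
    #endif
    #define STRCAT_POSTFIX cat

    /* Two-level expansion so the arguments are macro-expanded before the
       ## paste, exactly as in the quoted header.  */
    #define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
      underscores##prefix##postfix##ext
    #define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

    #define OVERFLOW_STRCPY \
      OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
    #define OVERFLOW_STRCAT \
      OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)

    /* Classic two-level stringification to print the pasted names.  */
    #define STR_(x) #x
    #define STR(x) STR_ (x)

    int
    main (void)
    {
      /* Prints "__stpcpy_evex" and "__strcat_evex" for this configuration.  */
      printf ("%s\n%s\n", STR (OVERFLOW_STRCPY), STR (OVERFLOW_STRCAT));
      return 0;
    }

[With USE_WITH_AVX2 and USE_WITH_RTM defined instead, the same machinery would presumably paste __stpcpy_avx2_rtm, which is where the review question above about keeping only one of the _avx / _avx2 suffixes comes in.]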
On Fri, Nov 4, 2022 at 3:47 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Fri, Nov 4, 2022 at 3:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > > > On Fri, Nov 4, 2022 at 2:46 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > On Fri, Nov 04, 2022 at 01:13:11PM -0700, Noah Goldstein wrote: > > > > Optimizations are: > > > > 1. Use more overlapping stores to avoid branches. > > > > 2. Reduce how unrolled the aligning copies are (this is more of a > > > > code-size save, its a negative for some sizes in terms of > > > > perf). > > > > 3. Improve the loop a bit (similiar to what we do in strlen with > > > > 2x vpminu + kortest instead of 3x vpminu + kmov + test). > > > > 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the > > > > number that are taken. > > > > > > > > Performance Changes: > > > > > > > > Times are from N = 10 runs of the benchmark suite and are > > > > reported as geometric mean of all ratios of > > > > New Implementation / Old Implementation. > > > > > > > > stpcpy-evex -> 0.922 > > > > strcat-evex -> 0.985 > > > > strcpy-evex -> 0.880 > > > > > > > > strncpy-evex -> 0.831 > > > > stpncpy-evex -> 0.780 > > > > > > > > strncat-evex -> 0.958 > > > > > > > > Code Size Changes: > > > > function -> Bytes New / Bytes Old -> Ratio > > > > > > > > strcat-evex -> 819 / 1874 -> 0.437 > > > > strcpy-evex -> 700 / 1074 -> 0.652 > > > > stpcpy-evex -> 735 / 1094 -> 0.672 > > > > > > > > strncpy-evex -> 1397 / 2611 -> 0.535 > > > > stpncpy-evex -> 1489 / 2691 -> 0.553 > > > > > > > > strncat-evex -> 1184 / 2832 -> 0.418 > > > > > > > > Notes: > > > > 1. Because of the significant difference between the > > > > implementations they are split into three files. > > > > > > > > strcpy-evex.S -> strcpy, stpcpy, strcat > > > > strncpy-evex.S -> strncpy > > > > strncat-evex.S > strncat > > > > > > > > I couldn't find a way to merge them without making the > > > > ifdefs incredibly difficult to follow. > > > > > > > > 2. All implementations can be made evex512 by including > > > > "x86-evex512-vecs.h" at the top. > > > > > > > > 3. All implementations have an optional define: > > > > `USE_EVEX_MASKED_STORE` > > > > Setting to one uses evex-masked stores for handling short > > > > strings. This saves code size and branches. It's disabled > > > > for all implementations are the moment as there are some > > > > serious drawbacks to masked stores in certain cases, but > > > > that may be fixed on future architectures. > > > > > > > > Full check passes on x86-64 and build succeeds for all ISA levels w/ > > > > and w/o multiarch. 
> > > > --- > > > > sysdeps/x86_64/multiarch/stpncpy-evex.S | 5 +- > > > > sysdeps/x86_64/multiarch/strcat-evex.S | 291 +--- > > > > sysdeps/x86_64/multiarch/strcat-strlen-evex.S | 110 ++ > > > > sysdeps/x86_64/multiarch/strcpy-evex.S | 1282 ++++++----------- > > > > sysdeps/x86_64/multiarch/strncat-evex.S | 525 ++++++- > > > > sysdeps/x86_64/multiarch/strncpy-evex.S | 995 ++++++++++++- > > > > .../multiarch/strncpy-or-cat-overflow-def.h | 65 + > > > > 7 files changed, 2100 insertions(+), 1173 deletions(-) > > > > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > > create mode 100644 sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > > index 99ea76a372..3693491baa 100644 > > > > --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > > +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S > > > > @@ -3,6 +3,5 @@ > > > > #endif > > > > > > > > #define USE_AS_STPCPY > > > > -#define USE_AS_STRNCPY > > > > -#define STRCPY STPNCPY > > > > -#include "strcpy-evex.S" > > > > +#define STRNCPY STPNCPY > > > > +#include "strncpy-evex.S" > > > > diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S > > > > index 0e2df947e9..b4207b7889 100644 > > > > --- a/sysdeps/x86_64/multiarch/strcat-evex.S > > > > +++ b/sysdeps/x86_64/multiarch/strcat-evex.S > > > > @@ -1,286 +1,7 @@ > > > > -/* strcat with 256-bit EVEX instructions. > > > > - Copyright (C) 2021-2022 Free Software Foundation, Inc. > > > > - This file is part of the GNU C Library. > > > > - > > > > - The GNU C Library is free software; you can redistribute it and/or > > > > - modify it under the terms of the GNU Lesser General Public > > > > - License as published by the Free Software Foundation; either > > > > - version 2.1 of the License, or (at your option) any later version. > > > > - > > > > - The GNU C Library is distributed in the hope that it will be useful, > > > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > - Lesser General Public License for more details. > > > > - > > > > - You should have received a copy of the GNU Lesser General Public > > > > - License along with the GNU C Library; if not, see > > > > - <https://www.gnu.org/licenses/>. 
*/ > > > > - > > > > -#include <isa-level.h> > > > > - > > > > -#if ISA_SHOULD_BUILD (4) > > > > - > > > > - > > > > -# include <sysdep.h> > > > > - > > > > -# ifndef STRCAT > > > > -# define STRCAT __strcat_evex > > > > -# endif > > > > - > > > > -# define VMOVU vmovdqu64 > > > > -# define VMOVA vmovdqa64 > > > > - > > > > -/* zero register */ > > > > -# define XMMZERO xmm16 > > > > -# define YMMZERO ymm16 > > > > -# define YMM0 ymm17 > > > > -# define YMM1 ymm18 > > > > - > > > > -# define USE_AS_STRCAT > > > > - > > > > -/* Number of bytes in a vector register */ > > > > -# define VEC_SIZE 32 > > > > - > > > > - .section .text.evex,"ax",@progbits > > > > -ENTRY (STRCAT) > > > > - mov %rdi, %r9 > > > > -# ifdef USE_AS_STRNCAT > > > > - mov %rdx, %r8 > > > > -# endif > > > > - > > > > - xor %eax, %eax > > > > - mov %edi, %ecx > > > > - and $((VEC_SIZE * 4) - 1), %ecx > > > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > > - cmp $(VEC_SIZE * 3), %ecx > > > > - ja L(fourth_vector_boundary) > > > > - vpcmpb $0, (%rdi), %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_first_vector) > > > > - mov %rdi, %rax > > > > - and $-VEC_SIZE, %rax > > > > - jmp L(align_vec_size_start) > > > > -L(fourth_vector_boundary): > > > > - mov %rdi, %rax > > > > - and $-VEC_SIZE, %rax > > > > - vpcmpb $0, (%rax), %YMMZERO, %k0 > > > > - mov $-1, %r10d > > > > - sub %rax, %rcx > > > > - shl %cl, %r10d > > > > - kmovd %k0, %edx > > > > - and %r10d, %edx > > > > - jnz L(exit) > > > > - > > > > -L(align_vec_size_start): > > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_second_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_third_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fourth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fifth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > > - add $(VEC_SIZE * 4), %rax > > > > - kmovd %k4, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_second_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_third_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fourth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fifth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > > - kmovd %k4, %edx > > > > - add $(VEC_SIZE * 4), %rax > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_second_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_third_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fourth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - test %edx, %edx > > > > - jnz 
L(exit_null_on_fifth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > > - add $(VEC_SIZE * 4), %rax > > > > - kmovd %k4, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_second_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_third_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fourth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fifth_vector) > > > > - > > > > - test $((VEC_SIZE * 4) - 1), %rax > > > > - jz L(align_four_vec_loop) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 > > > > - add $(VEC_SIZE * 5), %rax > > > > - kmovd %k4, %edx > > > > - test %edx, %edx > > > > - jnz L(exit) > > > > - > > > > - test $((VEC_SIZE * 4) - 1), %rax > > > > - jz L(align_four_vec_loop) > > > > - > > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > > - add $VEC_SIZE, %rax > > > > - kmovd %k0, %edx > > > > - test %edx, %edx > > > > - jnz L(exit) > > > > - > > > > - test $((VEC_SIZE * 4) - 1), %rax > > > > - jz L(align_four_vec_loop) > > > > - > > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 > > > > - add $VEC_SIZE, %rax > > > > - kmovd %k0, %edx > > > > - test %edx, %edx > > > > - jnz L(exit) > > > > - > > > > - test $((VEC_SIZE * 4) - 1), %rax > > > > - jz L(align_four_vec_loop) > > > > - > > > > - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 > > > > - add $VEC_SIZE, %rax > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit) > > > > - > > > > - add $VEC_SIZE, %rax > > > > - > > > > - .p2align 4 > > > > -L(align_four_vec_loop): > > > > - VMOVA (%rax), %YMM0 > > > > - VMOVA (VEC_SIZE * 2)(%rax), %YMM1 > > > > - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 > > > > - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 > > > > - vpminub %YMM0, %YMM1, %YMM0 > > > > - /* If K0 != 0, there is a null byte. 
*/ > > > > - vpcmpb $0, %YMM0, %YMMZERO, %k0 > > > > - add $(VEC_SIZE * 4), %rax > > > > - ktestd %k0, %k0 > > > > - jz L(align_four_vec_loop) > > > > - > > > > - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 > > > > - sub $(VEC_SIZE * 5), %rax > > > > - kmovd %k0, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_second_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_third_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - test %edx, %edx > > > > - jnz L(exit_null_on_fourth_vector) > > > > - > > > > - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - sub %rdi, %rax > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - add $(VEC_SIZE * 4), %rax > > > > - jmp L(StartStrcpyPart) > > > > - > > > > - .p2align 4 > > > > -L(exit): > > > > - sub %rdi, %rax > > > > -L(exit_null_on_first_vector): > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - jmp L(StartStrcpyPart) > > > > - > > > > - .p2align 4 > > > > -L(exit_null_on_second_vector): > > > > - sub %rdi, %rax > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - add $VEC_SIZE, %rax > > > > - jmp L(StartStrcpyPart) > > > > - > > > > - .p2align 4 > > > > -L(exit_null_on_third_vector): > > > > - sub %rdi, %rax > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - add $(VEC_SIZE * 2), %rax > > > > - jmp L(StartStrcpyPart) > > > > - > > > > - .p2align 4 > > > > -L(exit_null_on_fourth_vector): > > > > - sub %rdi, %rax > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - add $(VEC_SIZE * 3), %rax > > > > - jmp L(StartStrcpyPart) > > > > - > > > > - .p2align 4 > > > > -L(exit_null_on_fifth_vector): > > > > - sub %rdi, %rax > > > > - bsf %rdx, %rdx > > > > - add %rdx, %rax > > > > - add $(VEC_SIZE * 4), %rax > > > > - > > > > - .p2align 4 > > > > -L(StartStrcpyPart): > > > > - lea (%r9, %rax), %rdi > > > > - mov %rsi, %rcx > > > > - mov %r9, %rax /* save result */ > > > > - > > > > -# ifdef USE_AS_STRNCAT > > > > - test %r8, %r8 > > > > - jz L(ExitZero) > > > > -# define USE_AS_STRNCPY > > > > -# endif > > > > - > > > > -# include "strcpy-evex.S" > > > > +#ifndef STRCAT > > > > +# define STRCAT __strcat_evex > > > > #endif > > > > + > > > > +#define USE_AS_STRCAT > > > > +#define STRCPY STRCAT > > > > +#include "strcpy-evex.S" > > > > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > > new file mode 100644 > > > > index 0000000000..9530d7b683 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S > > > > @@ -0,0 +1,110 @@ > > > > +/* strlen used for begining of str{n}cat using EVEX 256/512. > > > > + Copyright (C) 2011-2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. 
> > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > + > > > > +/* NOTE: This file is meant to be included by strcat-evex or > > > > + strncat-evex and does not standalone. Before including %rdi > > > > + must be saved in %rax. */ > > > > > > Since this file isn't standalone, please rename it to .h. > > > > Can it be .h.S so it plays well it IDE modes? > > It sounds reasonable. Fixed in V4. > > > > > > > > + > > > > + > > > > +/* Simple strlen implementation that ends at > > > > + L(strcat_strlen_done). */ > > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > > + movq %rdi, %r8 > > > > + andq $(VEC_SIZE * -1), %r8 > > > > + VPCMPEQ (%r8), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > +#ifdef USE_AS_WCSCPY > > > > + subl %r8d, %edi > > > > + shrl $2, %edi > > > > +#endif > > > > + shrx %VRDI, %VRCX, %VRCX > > > > +#ifdef USE_AS_WCSCPY > > > > + movq %rax, %rdi > > > > +#endif > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v0) > > > > + > > > > + > > > > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > + leaq (VEC_SIZE)(%r8), %rdi > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v0) > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v1) > > > > + > > > > + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v2) > > > > + > > > > + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v3) > > > > + > > > > + andq $-(VEC_SIZE * 4), %rdi > > > > + .p2align 4,, 8 > > > > +L(loop_2x_vec): > > > > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) > > > > + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) > > > > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) > > > > + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) > > > > + VPTESTN %VMM(1), %VMM(1), %k1 > > > > + VPTESTN %VMM(3), %VMM(3), %k3 > > > > + subq $(VEC_SIZE * -4), %rdi > > > > + KORTEST %k1, %k3 > > > > + jz L(loop_2x_vec) > > > > + > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v0) > > > > + > > > > + KMOV %k1, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v1) > > > > + > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(bsf_and_done_v2) > > > > + > > > > + KMOV %k3, %VRCX > > > > +L(bsf_and_done_v3): > > > > + addq $VEC_SIZE, %rdi > > > > +L(bsf_and_done_v2): > > > > + bsf %VRCX, %VRCX > > > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi > > > > + jmp L(strcat_strlen_done) > > > > + > > > > + .p2align 4,, 4 > > > > +L(bsf_and_done_v1): > > > > + addq $VEC_SIZE, %rdi > > > > +L(bsf_and_done_v0): > > > > + bsf %VRCX, %VRCX > > > > +#ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +#else > > > > + addq %rcx, %rdi > > > > +#endif > > > > +L(strcat_strlen_done): > > > > diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S > > > > index 82e45ac675..1ba0195ed2 100644 > > > > --- a/sysdeps/x86_64/multiarch/strcpy-evex.S > > > > +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S > > > > @@ -1,4 +1,4 @@ > > > > -/* strcpy with 256-bit EVEX instructions. > > > > +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. 
> > > > Copyright (C) 2021-2022 Free Software Foundation, Inc. > > > > This file is part of the GNU C Library. > > > > > > > > @@ -17,990 +17,526 @@ > > > > <https://www.gnu.org/licenses/>. */ > > > > > > > > #include <isa-level.h> > > > > - > > > > #if ISA_SHOULD_BUILD (4) > > > > > > > > > > > > -# ifndef USE_AS_STRCAT > > > > -# include <sysdep.h> > > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > > + moment. */ > > > > +# define USE_EVEX_MASKED_STORE 0 > > > > + /* Use movsb in page cross case to save code size. */ > > > > +# define USE_MOVSB_IN_PAGE_CROSS 1 > > > > > > > > -# ifndef STRCPY > > > > -# define STRCPY __strcpy_evex > > > > -# endif > > > > +# include <sysdep.h> > > > > > > > > +# ifndef VEC_SIZE > > > > +# include "x86-evex256-vecs.h" > > > > # endif > > > > > > > > -# define VMOVU vmovdqu64 > > > > -# define VMOVA vmovdqa64 > > > > - > > > > -/* Number of bytes in a vector register */ > > > > -# ifndef VEC_SIZE > > > > -# define VEC_SIZE 32 > > > > +# ifndef STRCPY > > > > +# define STRCPY __strcpy_evex > > > > # endif > > > > > > > > -# define XMM2 xmm18 > > > > -# define XMM3 xmm19 > > > > > > > > -# define YMM2 ymm18 > > > > -# define YMM3 ymm19 > > > > -# define YMM4 ymm20 > > > > -# define YMM5 ymm21 > > > > -# define YMM6 ymm22 > > > > -# define YMM7 ymm23 > > > > +# ifdef USE_AS_WCSCPY > > > > +# define VMOVU_MASK vmovdqu32 > > > > +# define VPMIN vpminud > > > > +# define VPTESTN vptestnmd > > > > +# define VPTEST vptestmd > > > > +# define VPCMPEQ vpcmpeqd > > > > +# define CHAR_SIZE 4 > > > > > > > > -# ifndef USE_AS_STRCAT > > > > +# define REP_MOVS rep movsd > > > > > > > > -/* zero register */ > > > > -# define XMMZERO xmm16 > > > > -# define YMMZERO ymm16 > > > > -# define YMM1 ymm17 > > > > - > > > > - .section .text.evex,"ax",@progbits > > > > -ENTRY (STRCPY) > > > > -# ifdef USE_AS_STRNCPY > > > > - mov %RDX_LP, %R8_LP > > > > - test %R8_LP, %R8_LP > > > > - jz L(ExitZero) > > > > -# endif > > > > - mov %rsi, %rcx > > > > -# ifndef USE_AS_STPCPY > > > > - mov %rdi, %rax /* save result */ > > > > -# endif > > > > +# define USE_WIDE_CHAR > > > > +# else > > > > +# define VMOVU_MASK vmovdqu8 > > > > +# define VPMIN vpminub > > > > +# define VPTESTN vptestnmb > > > > +# define VPTEST vptestmb > > > > +# define VPCMPEQ vpcmpeqb > > > > +# define CHAR_SIZE 1 > > > > > > > > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > > > > +# define REP_MOVS rep movsb > > > > # endif > > > > > > > > - and $((VEC_SIZE * 4) - 1), %ecx > > > > - cmp $(VEC_SIZE * 2), %ecx > > > > - jbe L(SourceStringAlignmentLessTwoVecSize) > > > > - > > > > - and $-VEC_SIZE, %rsi > > > > - and $(VEC_SIZE - 1), %ecx > > > > - > > > > - vpcmpb $0, (%rsi), %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - shr %cl, %rdx > > > > +# include "reg-macros.h" > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > > - mov $VEC_SIZE, %r10 > > > > - sub %rcx, %r10 > > > > - cmp %r10, %r8 > > > > -# else > > > > - mov $(VEC_SIZE + 1), %r10 > > > > - sub %rcx, %r10 > > > > - cmp %r10, %r8 > > > > -# endif > > > > - jbe L(CopyVecSizeTailCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > - jnz L(CopyVecSizeTail) > > > > - > > > > - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > - add $VEC_SIZE, %r10 > > > > - cmp %r10, %r8 > > > > - jbe L(CopyTwoVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > - jnz L(CopyTwoVecSize) > > > > - > > > > - 
VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ > > > > - VMOVU %YMM2, (%rdi) > > > > - > > > > -/* If source address alignment != destination address alignment */ > > > > - .p2align 4 > > > > -L(UnalignVecSizeBoth): > > > > - sub %rcx, %rdi > > > > -# ifdef USE_AS_STRNCPY > > > > - add %rcx, %r8 > > > > - sbb %rcx, %rcx > > > > - or %rcx, %r8 > > > > -# endif > > > > - mov $VEC_SIZE, %rcx > > > > - VMOVA (%rsi, %rcx), %YMM2 > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $(VEC_SIZE * 3), %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec2) > > > > +# ifdef USE_AS_STPCPY > > > > +# define END_REG rax > > > > # else > > > > - jnz L(CopyVecSize) > > > > +# define END_REG rdi, %rdx, CHAR_SIZE > > > > # endif > > > > > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec3) > > > > +# ifdef USE_AS_STRCAT > > > > +# define PAGE_ALIGN_REG edx > > > > +# define PAGE_ALIGN_REG_64 rdx > > > > # else > > > > - jnz L(CopyVecSize) > > > > +# define PAGE_ALIGN_REG eax > > > > +# define PAGE_ALIGN_REG_64 rax > > > > # endif > > > > > > > > - VMOVU %YMM3, (%rdi, %rcx) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 > > > > - vpcmpb $0, %YMM4, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec4) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > +# define VZERO VMM(7) > > > > +# define VZERO_128 VMM_128(7) > > > > > > > > - VMOVU %YMM4, (%rdi, %rcx) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec2) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec2) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > > > > > - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > - add $VEC_SIZE, %rcx > 
> > > -# ifdef USE_AS_STRNCPY > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec3) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > > > > > - VMOVU %YMM3, (%rdi, %rcx) > > > > - mov %rsi, %rdx > > > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > > > - and $-(VEC_SIZE * 4), %rsi > > > > - sub %rsi, %rdx > > > > - sub %rdx, %rdi > > > > -# ifdef USE_AS_STRNCPY > > > > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 > > > > -# endif > > > > -L(UnalignedFourVecSizeLoop): > > > > - VMOVA (%rsi), %YMM4 > > > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > > > - vpminub %YMM5, %YMM4, %YMM2 > > > > - vpminub %YMM7, %YMM6, %YMM3 > > > > - vpminub %YMM2, %YMM3, %YMM2 > > > > - /* If K7 != 0, there is a null byte. */ > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > > > - kmovd %k7, %edx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $(VEC_SIZE * 4), %r8 > > > > - jbe L(UnalignedLeaveCase2OrCase3) > > > > + .section SECTION(.text), "ax", @progbits > > > > +ENTRY(STRCPY) > > > > +# ifdef USE_AS_STRCAT > > > > + movq %rdi, %rax > > > > +# include "strcat-strlen-evex.S" > > > > # endif > > > > - test %edx, %edx > > > > - jnz L(UnalignedFourVecSizeLeave) > > > > - > > > > -L(UnalignedFourVecSizeLoop_start): > > > > - add $(VEC_SIZE * 4), %rdi > > > > - add $(VEC_SIZE * 4), %rsi > > > > - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) > > > > - VMOVA (%rsi), %YMM4 > > > > - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) > > > > - VMOVA VEC_SIZE(%rsi), %YMM5 > > > > - vpminub %YMM5, %YMM4, %YMM2 > > > > - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) > > > > - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 > > > > - VMOVU %YMM7, -VEC_SIZE(%rdi) > > > > - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 > > > > - vpminub %YMM7, %YMM6, %YMM3 > > > > - vpminub %YMM2, %YMM3, %YMM2 > > > > - /* If K7 != 0, there is a null byte. */ > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k7 > > > > - kmovd %k7, %edx > > > > -# ifdef USE_AS_STRNCPY > > > > - sub $(VEC_SIZE * 4), %r8 > > > > - jbe L(UnalignedLeaveCase2OrCase3) > > > > + > > > > + movl %esi, %PAGE_ALIGN_REG > > > > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG > > > > + ja L(page_cross) > > > > +L(page_cross_continue): > > > > + VMOVU (%rsi), %VMM(0) > > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > > + movq %rdi, %rax > > > > # endif > > > > - test %edx, %edx > > > > - jz L(UnalignedFourVecSizeLoop_start) > > > > > > > > -L(UnalignedFourVecSizeLeave): > > > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - test %edx, %edx > > > > - jnz L(CopyVecSizeUnaligned_0) > > > > > > > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > > > > - kmovd %k2, %ecx > > > > - test %ecx, %ecx > > > > - jnz L(CopyVecSizeUnaligned_16) > > > > + /* Two short string implementations. One with traditional > > > > + branching approach and one with masked instructions (which > > > > + have potential for dramatically bad perf if dst splits a > > > > + page and is not in the TLB). 
*/ > > > > +# if USE_EVEX_MASKED_STORE > > > > + VPTEST %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > +# ifdef USE_AS_WCSCPY > > > > + subl $((1 << CHAR_PER_VEC)- 1), %VRCX > > > > +# else > > > > + inc %VRCX > > > > +# endif > > > > + jz L(more_1x_vec) > > > > + KMOV %VRCX, %k1 > > > > + KXOR %k0, %k1, %k1 > > > > > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - test %edx, %edx > > > > - jnz L(CopyVecSizeUnaligned_32) > > > > - > > > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > > > - kmovd %k4, %ecx > > > > - bsf %ecx, %edx > > > > - VMOVU %YMM4, (%rdi) > > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > -# ifdef USE_AS_STPCPY > > > > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax > > > > -# endif > > > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > > > - add $(VEC_SIZE - 1), %r8 > > > > - sub %rdx, %r8 > > > > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi > > > > - jmp L(StrncpyFillTailWithZero) > > > > -# else > > > > - add $(VEC_SIZE * 3), %rsi > > > > - add $(VEC_SIZE * 3), %rdi > > > > - jmp L(CopyVecSizeExit) > > > > -# endif > > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > > > > > > -/* If source address alignment == destination address alignment */ > > > > +# ifdef USE_AS_STPCPY > > > > + bsf %VRCX, %VRCX > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > > > +# endif > > > > + ret > > > > > > > > -L(SourceStringAlignmentLessTwoVecSize): > > > > - VMOVU (%rsi), %YMM3 > > > > - VMOVU VEC_SIZE(%rsi), %YMM2 > > > > - vpcmpb $0, %YMM3, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > +# else > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jz L(more_1x_vec) > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > > - cmp $VEC_SIZE, %r8 > > > > + xorl %edx, %edx > > > > + bsf %VRCX, %VRDX > > > > +# ifdef USE_AS_STPCPY > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# endif > > > > + > > > > + /* Use mask bits in rcx to detect which copy we need. If the low > > > > + mask is zero then there must be a bit set in the upper half. > > > > + I.e if rcx != 0 and ecx == 0, then match must be upper 32 > > > > + bits so we use L(copy_32_63). 
*/ > > > > +# if VEC_SIZE == 64 > > > > +# ifdef USE_AS_WCSCPY > > > > + testb %cl, %cl > > > > +# else > > > > + testl %ecx, %ecx > > > > +# endif > > > > + jz L(copy_32_63) > > > > +# endif > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > + testb $0xf, %cl > > > > # else > > > > - cmp $(VEC_SIZE + 1), %r8 > > > > + testw %cx, %cx > > > > # endif > > > > - jbe L(CopyVecSizeTail1Case2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > - jnz L(CopyVecSizeTail1) > > > > + jz L(copy_16_31) > > > > > > > > - VMOVU %YMM3, (%rdi) > > > > - vpcmpb $0, %YMM2, %YMMZERO, %k0 > > > > - kmovd %k0, %edx > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > > - cmp $(VEC_SIZE * 2), %r8 > > > > +# ifdef USE_AS_WCSCPY > > > > + testb $0x3, %cl > > > > # else > > > > - cmp $((VEC_SIZE * 2) + 1), %r8 > > > > + testb %cl, %cl > > > > # endif > > > > - jbe L(CopyTwoVecSize1Case2OrCase3) > > > > -# endif > > > > - test %edx, %edx > > > > - jnz L(CopyTwoVecSize1) > > > > - > > > > - and $-VEC_SIZE, %rsi > > > > - and $(VEC_SIZE - 1), %ecx > > > > - jmp L(UnalignVecSizeBoth) > > > > + jz L(copy_8_15) > > > > > > > > -/*------End of main part with loops---------------------*/ > > > > > > > > -/* Case1 */ > > > > +# ifdef USE_AS_WCSCPY > > > > + vmovd %VMM_128(0), (%rdi) > > > > + /* No need to copy, we know its zero. */ > > > > + movl $0, (%END_REG) > > > > > > > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) > > > > - .p2align 4 > > > > -L(CopyVecSize): > > > > - add %rcx, %rdi > > > > -# endif > > > > -L(CopyVecSizeTail): > > > > - add %rcx, %rsi > > > > -L(CopyVecSizeTail1): > > > > - bsf %edx, %edx > > > > -L(CopyVecSizeExit): > > > > - cmp $32, %edx > > > > - jae L(Exit32_63) > > > > - cmp $16, %edx > > > > - jae L(Exit16_31) > > > > - cmp $8, %edx > > > > - jae L(Exit8_15) > > > > - cmp $4, %edx > > > > - jae L(Exit4_7) > > > > - cmp $3, %edx > > > > - je L(Exit3) > > > > - cmp $1, %edx > > > > - ja L(Exit2) > > > > - je L(Exit1) > > > > - movb $0, (%rdi) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub $1, %r8 > > > > - lea 1(%rdi), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > -# endif > > > > ret > > > > +# else > > > > > > > > - .p2align 4 > > > > -L(CopyTwoVecSize1): > > > > - add $VEC_SIZE, %rsi > > > > - add $VEC_SIZE, %rdi > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub $VEC_SIZE, %r8 > > > > -# endif > > > > - jmp L(CopyVecSizeTail1) > > > > - > > > > - .p2align 4 > > > > -L(CopyTwoVecSize): > > > > - bsf %edx, %edx > > > > - add %rcx, %rsi > > > > - add $VEC_SIZE, %edx > > > > - sub %ecx, %edx > > > > - jmp L(CopyVecSizeExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeUnaligned_0): > > > > - bsf %edx, %edx > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > -# endif > > > > - VMOVU %YMM4, (%rdi) > > > > - add $((VEC_SIZE * 4) - 1), %r8 > > > > - sub %rdx, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - jmp L(StrncpyFillTailWithZero) > > > > -# else > > > > - jmp L(CopyVecSizeExit) > > > > -# endif > > > > + testb $0x7, %cl > > > > + jz L(copy_4_7) > > > > > > > > - .p2align 4 > > > > -L(CopyVecSizeUnaligned_16): > > > > - bsf %ecx, %edx > > > > - VMOVU %YMM4, (%rdi) > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > -# ifdef USE_AS_STPCPY > > > > - lea VEC_SIZE(%rdi, %rdx), 
%rax > > > > -# endif > > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > > - add $((VEC_SIZE * 3) - 1), %r8 > > > > - sub %rdx, %r8 > > > > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi > > > > - jmp L(StrncpyFillTailWithZero) > > > > -# else > > > > - add $VEC_SIZE, %rsi > > > > - add $VEC_SIZE, %rdi > > > > - jmp L(CopyVecSizeExit) > > > > -# endif > > > > > > > > - .p2align 4 > > > > -L(CopyVecSizeUnaligned_32): > > > > - bsf %edx, %edx > > > > - VMOVU %YMM4, (%rdi) > > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > -# ifdef USE_AS_STPCPY > > > > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax > > > > -# endif > > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > > - add $((VEC_SIZE * 2) - 1), %r8 > > > > - sub %rdx, %r8 > > > > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi > > > > - jmp L(StrncpyFillTailWithZero) > > > > -# else > > > > - add $(VEC_SIZE * 2), %rsi > > > > - add $(VEC_SIZE * 2), %rdi > > > > - jmp L(CopyVecSizeExit) > > > > -# endif > > > > + test %edx, %edx > > > > + jz L(set_null_term) > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > -# ifndef USE_AS_STRCAT > > > > - .p2align 4 > > > > -L(CopyVecSizeUnalignedVec6): > > > > - VMOVU %YMM6, (%rdi, %rcx) > > > > - jmp L(CopyVecSizeVecExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeUnalignedVec5): > > > > - VMOVU %YMM5, (%rdi, %rcx) > > > > - jmp L(CopyVecSizeVecExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeUnalignedVec4): > > > > - VMOVU %YMM4, (%rdi, %rcx) > > > > - jmp L(CopyVecSizeVecExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeUnalignedVec3): > > > > - VMOVU %YMM3, (%rdi, %rcx) > > > > - jmp L(CopyVecSizeVecExit) > > > > + /* NB: make this `vmovw` if support for AVX512-FP16 is added. > > > > + */ > > > > + vmovd %VMM_128(0), %esi > > > > + movw %si, (%rdi) > > > > + > > > > + .p2align 4,, 1 > > > > +L(set_null_term): > > > > + /* No need to copy, we know its zero. 
*/ > > > > + movb $0, (%END_REG) > > > > + ret > > > > # endif > > > > > > > > -/* Case2 */ > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeCase2): > > > > - add $VEC_SIZE, %r8 > > > > - add %rcx, %rdi > > > > - add %rcx, %rsi > > > > - bsf %edx, %edx > > > > - cmp %r8d, %edx > > > > - jb L(CopyVecSizeExit) > > > > - jmp L(StrncpyExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyTwoVecSizeCase2): > > > > - add %rcx, %rsi > > > > - bsf %edx, %edx > > > > - add $VEC_SIZE, %edx > > > > - sub %ecx, %edx > > > > - cmp %r8d, %edx > > > > - jb L(CopyVecSizeExit) > > > > - jmp L(StrncpyExit) > > > > - > > > > -L(CopyVecSizeTailCase2): > > > > - add %rcx, %rsi > > > > - bsf %edx, %edx > > > > - cmp %r8d, %edx > > > > - jb L(CopyVecSizeExit) > > > > - jmp L(StrncpyExit) > > > > - > > > > -L(CopyVecSizeTail1Case2): > > > > - bsf %edx, %edx > > > > - cmp %r8d, %edx > > > > - jb L(CopyVecSizeExit) > > > > - jmp L(StrncpyExit) > > > > - > > > > -/* Case2 or Case3, Case3 */ > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeCase2OrCase3): > > > > - test %rdx, %rdx > > > > - jnz L(CopyVecSizeCase2) > > > > -L(CopyVecSizeCase3): > > > > - add $VEC_SIZE, %r8 > > > > - add %rcx, %rdi > > > > - add %rcx, %rsi > > > > - jmp L(StrncpyExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyTwoVecSizeCase2OrCase3): > > > > - test %rdx, %rdx > > > > - jnz L(CopyTwoVecSizeCase2) > > > > - add %rcx, %rsi > > > > - jmp L(StrncpyExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeTailCase2OrCase3): > > > > - test %rdx, %rdx > > > > - jnz L(CopyVecSizeTailCase2) > > > > - add %rcx, %rsi > > > > - jmp L(StrncpyExit) > > > > - > > > > - .p2align 4 > > > > -L(CopyTwoVecSize1Case2OrCase3): > > > > - add $VEC_SIZE, %rdi > > > > - add $VEC_SIZE, %rsi > > > > - sub $VEC_SIZE, %r8 > > > > -L(CopyVecSizeTail1Case2OrCase3): > > > > - test %rdx, %rdx > > > > - jnz L(CopyVecSizeTail1Case2) > > > > - jmp L(StrncpyExit) > > > > +# if VEC_SIZE == 64 > > > > + .p2align 4,, 6 > > > > +L(copy_32_63): > > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > > + VMOVU %VMM_256(0), (%rdi) > > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > > > + ret > > > > +# endif > > > > + > > > > + > > > > + .p2align 4,, 6 > > > > +L(copy_16_31): > > > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > > > + and will save code size. 
*/ > > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > > + VMOVU %VMM_128(0), (%rdi) > > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(copy_8_15): > > > > +# ifdef USE_AS_WCSCPY > > > > + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > > > +# else > > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx > > > > +# endif > > > > + vmovq %VMM_128(0), (%rdi) > > > > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) > > > > + ret > > > > # endif > > > > > > > > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ > > > > > > > > - .p2align 4 > > > > -L(Exit1): > > > > - movzwl (%rsi), %edx > > > > - mov %dx, (%rdi) > > > > -# ifdef USE_AS_STPCPY > > > > - lea 1(%rdi), %rax > > > > +# ifndef USE_AS_WCSCPY > > > > + .p2align 4,, 12 > > > > +L(copy_4_7): > > > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx > > > > + vmovd %VMM_128(0), (%rdi) > > > > + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) > > > > + ret > > > > # endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub $2, %r8 > > > > - lea 2(%rdi), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + > > > > + > > > > + .p2align 4,, 8 > > > > +L(more_1x_vec): > > > > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > > > > + VMOVU %VMM(0), (%rdi) > > > > # endif > > > > - ret > > > > + subq %rsi, %rdi > > > > + andq $-(VEC_SIZE), %rsi > > > > + addq %rsi, %rdi > > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > > > > > > - .p2align 4 > > > > -L(Exit2): > > > > - movzwl (%rsi), %ecx > > > > - mov %cx, (%rdi) > > > > - movb $0, 2(%rdi) > > > > + /* Ideally we store after moves to minimize impact of potential > > > > + false-dependencies. */ > > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > > + VMOVU %VMM(0), (%rax) > > > > +# endif > > > > + > > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x1) > > > > + > > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > > + VMOVU %VMM(1), VEC_SIZE(%rdi) > > > > + > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x2) > > > > + > > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > > + > > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > > + KMOV %k0, %VRDX > > > > + test %VRDX, %VRDX > > > > + jnz L(ret_vec_x3) > > > > + > > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x4) > > > > + > > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > > + > > > > + > > > > + /* Align for 4x loop. */ > > > > + subq %rsi, %rdi > > > > + > > > > + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE > > > > + we covered before aligning. */ > > > > + subq $-(VEC_SIZE * 5), %rsi > > > > + andq $-(VEC_SIZE * 4), %rsi > > > > + > > > > + > > > > + /* Load first half of the loop before entry. 
*/ > > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + KORTEST %k2, %k4 > > > > + jnz L(loop_4x_done) > > > > + > > > > + .p2align 4,, 11 > > > > +L(loop_4x_vec): > > > > + > > > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > > + > > > > + subq $(VEC_SIZE * -4), %rsi > > > > + > > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > > + > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + KORTEST %k2, %k4 > > > > + jz L(loop_4x_vec) > > > > + > > > > +L(loop_4x_done): > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + /* Restore rdi (%rdi). */ > > > > + addq %rsi, %rdi > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x0_end) > > > > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > > > > + > > > > + KMOV %k2, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x1) > > > > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > > > > + > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x2) > > > > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > > > > + /* Place L(ret_vec_x4) here to save code size. We get a > > > > + meaningfuly benefit doing this for stpcpy. 
*/ > > > > + KMOV %k4, %VRDX > > > > +L(ret_vec_x3): > > > > + bsf %VRDX, %VRDX > > > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > # ifdef USE_AS_STPCPY > > > > - lea 2(%rdi), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub $3, %r8 > > > > - lea 3(%rdi), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax > > > > # endif > > > > +L(return_end): > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Exit3): > > > > - mov (%rsi), %edx > > > > - mov %edx, (%rdi) > > > > + .p2align 4,, 6 > > > > +L(ret_vec_x0_end): > > > > + bsf %VRCX, %VRCX > > > > # ifdef USE_AS_STPCPY > > > > - lea 3(%rdi), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub $4, %r8 > > > > - lea 4(%rdi), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rax > > > > # endif > > > > + inc %VRCX > > > > + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Exit4_7): > > > > - mov (%rsi), %ecx > > > > - mov %ecx, (%rdi) > > > > - mov -3(%rsi, %rdx), %ecx > > > > - mov %ecx, -3(%rdi, %rdx) > > > > + .p2align 4,, 8 > > > > +L(ret_vec_x1): > > > > + bsf %VRCX, %VRCX > > > > + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > # ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub %rdx, %r8 > > > > - sub $1, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > > > # endif > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Exit8_15): > > > > - mov (%rsi), %rcx > > > > - mov -7(%rsi, %rdx), %r9 > > > > - mov %rcx, (%rdi) > > > > - mov %r9, -7(%rdi, %rdx) > > > > + .p2align 4,, 4 > > > > +L(ret_vec_x2): > > > > + bsf %VRCX, %VRCX > > > > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > # ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub %rdx, %r8 > > > > - sub $1, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > > > > # endif > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Exit16_31): > > > > - VMOVU (%rsi), %XMM2 > > > > - VMOVU -15(%rsi, %rdx), %XMM3 > > > > - VMOVU %XMM2, (%rdi) > > > > - VMOVU %XMM3, -15(%rdi, %rdx) > > > > + /* ret_vec_x3 reuses return code after the loop. 
*/ > > > > + .p2align 4,, 6 > > > > +L(ret_vec_x4): > > > > + bsf %VRCX, %VRCX > > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > # ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > -# endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub %rdx, %r8 > > > > - sub $1, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > > > > # endif > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Exit32_63): > > > > - VMOVU (%rsi), %YMM2 > > > > - VMOVU -31(%rsi, %rdx), %YMM3 > > > > - VMOVU %YMM2, (%rdi) > > > > - VMOVU %YMM3, -31(%rdi, %rdx) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > + > > > > + .p2align 4,, 4 > > > > +L(page_cross): > > > > +# ifndef USE_AS_STRCAT > > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > > # endif > > > > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > > > > - sub %rdx, %r8 > > > > - sub $1, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - jnz L(StrncpyFillTailWithZero) > > > > + movq %rsi, %rcx > > > > + andq $(VEC_SIZE * -1), %rcx > > > > + > > > > + VPCMPEQ (%rcx), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > +# ifdef USE_AS_WCSCPY > > > > + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG > > > > + shrl $2, %PAGE_ALIGN_REG > > > > # endif > > > > - ret > > > > + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX > > > > > > > > -# ifdef USE_AS_STRNCPY > > > > +# if USE_MOVSB_IN_PAGE_CROSS > > > > + /* Optimizing more aggressively for space as this is very cold > > > > + code. This saves 2x cache lines. */ > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit1): > > > > - movzbl (%rsi), %edx > > > > - mov %dl, (%rdi) > > > > -# ifdef USE_AS_STPCPY > > > > - lea 1(%rdi), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, 1(%rdi) > > > > + /* This adds once to the later result which will get correct > > > > + copy bounds. NB: this can never zero-out a non-zero RCX as > > > > + to be in the page cross case rsi cannot be aligned and we > > > > + already right-shift rcx by the misalignment. 
*/ > > > > + shl %VRCX > > > > + jz L(page_cross_continue) > > > > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > > > > + movq %rdi, %rax > > > > # endif > > > > - ret > > > > + bsf %VRCX, %VRCX > > > > + REP_MOVS > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit2): > > > > - movzwl (%rsi), %edx > > > > - mov %dx, (%rdi) > > > > # ifdef USE_AS_STPCPY > > > > - lea 2(%rdi), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, 2(%rdi) > > > > + leaq -CHAR_SIZE(%rdi), %rax > > > > # endif > > > > ret > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit3_4): > > > > - movzwl (%rsi), %ecx > > > > - movzwl -2(%rsi, %r8), %edx > > > > - mov %cx, (%rdi) > > > > - mov %dx, -2(%rdi, %r8) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %r8), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi, %r8) > > > > -# endif > > > > - ret > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit5_8): > > > > - mov (%rsi), %ecx > > > > - mov -4(%rsi, %r8), %edx > > > > - mov %ecx, (%rdi) > > > > - mov %edx, -4(%rdi, %r8) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %r8), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi, %r8) > > > > -# endif > > > > - ret > > > > +# else > > > > + /* Check if we found zero-char before end of page. */ > > > > + test %VRCX, %VRCX > > > > + jz L(page_cross_continue) > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit9_16): > > > > - mov (%rsi), %rcx > > > > - mov -8(%rsi, %r8), %rdx > > > > - mov %rcx, (%rdi) > > > > - mov %rdx, -8(%rdi, %r8) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %r8), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi, %r8) > > > > -# endif > > > > - ret > > > > + /* Traditional copy case, essentially same as used in non-page- > > > > + cross case but since we can't reuse VMM(0) we need twice as > > > > + many loads from rsi. */ > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit17_32): > > > > - VMOVU (%rsi), %XMM2 > > > > - VMOVU -16(%rsi, %r8), %XMM3 > > > > - VMOVU %XMM2, (%rdi) > > > > - VMOVU %XMM3, -16(%rdi, %r8) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %r8), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi, %r8) > > > > +# ifndef USE_AS_STRCAT > > > > + xorl %edx, %edx > > > > # endif > > > > - ret > > > > - > > > > - .p2align 4 > > > > -L(StrncpyExit33_64): > > > > - /* 0/32, 31/16 */ > > > > - VMOVU (%rsi), %YMM2 > > > > - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 > > > > - VMOVU %YMM2, (%rdi) > > > > - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) > > > > + /* Dependency on rdi must already have been satisfied. 
*/ > > > > + bsf %VRCX, %VRDX > > > > # ifdef USE_AS_STPCPY > > > > - lea (%rdi, %r8), %rax > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# elif !defined USE_AS_STRCAT > > > > + movq %rdi, %rax > > > > # endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi, %r8) > > > > -# endif > > > > - ret > > > > > > > > - .p2align 4 > > > > -L(StrncpyExit65): > > > > - /* 0/32, 32/32, 64/1 */ > > > > - VMOVU (%rsi), %YMM2 > > > > - VMOVU 32(%rsi), %YMM3 > > > > - mov 64(%rsi), %cl > > > > - VMOVU %YMM2, (%rdi) > > > > - VMOVU %YMM3, 32(%rdi) > > > > - mov %cl, 64(%rdi) > > > > -# ifdef USE_AS_STPCPY > > > > - lea 65(%rdi), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, 65(%rdi) > > > > +# if VEC_SIZE == 64 > > > > +# ifdef USE_AS_WCSCPY > > > > + testb %cl, %cl > > > > +# else > > > > + test %ecx, %ecx > > > > +# endif > > > > + jz L(page_cross_copy_32_63) > > > > # endif > > > > - ret > > > > - > > > > -# ifndef USE_AS_STRCAT > > > > > > > > - .p2align 4 > > > > -L(Fill1): > > > > - mov %dl, (%rdi) > > > > - ret > > > > +# ifdef USE_AS_WCSCPY > > > > + testb $0xf, %cl > > > > +# else > > > > + testw %cx, %cx > > > > +# endif > > > > + jz L(page_cross_copy_16_31) > > > > > > > > - .p2align 4 > > > > -L(Fill2): > > > > - mov %dx, (%rdi) > > > > - ret > > > > +# ifdef USE_AS_WCSCPY > > > > + testb $0x3, %cl > > > > +# else > > > > + testb %cl, %cl > > > > +# endif > > > > + jz L(page_cross_copy_8_15) > > > > > > > > - .p2align 4 > > > > -L(Fill3_4): > > > > - mov %dx, (%rdi) > > > > - mov %dx, -2(%rdi, %r8) > > > > +# ifdef USE_AS_WCSCPY > > > > + movl (%rsi), %esi > > > > + movl %esi, (%rdi) > > > > + movl $0, (%END_REG) > > > > ret > > > > +# else > > > > > > > > - .p2align 4 > > > > -L(Fill5_8): > > > > - mov %edx, (%rdi) > > > > - mov %edx, -4(%rdi, %r8) > > > > - ret > > > > + testb $0x7, %cl > > > > + jz L(page_cross_copy_4_7) > > > > > > > > - .p2align 4 > > > > -L(Fill9_16): > > > > - mov %rdx, (%rdi) > > > > - mov %rdx, -8(%rdi, %r8) > > > > + test %edx, %edx > > > > + jz L(page_cross_set_null_term) > > > > + movzwl (%rsi), %ecx > > > > + movw %cx, (%rdi) > > > > +L(page_cross_set_null_term): > > > > + movb $0, (%END_REG) > > > > ret > > > > > > > > - .p2align 4 > > > > -L(Fill17_32): > > > > - VMOVU %XMMZERO, (%rdi) > > > > - VMOVU %XMMZERO, -16(%rdi, %r8) > > > > - ret > > > > > > > > - .p2align 4 > > > > -L(CopyVecSizeUnalignedVec2): > > > > - VMOVU %YMM2, (%rdi, %rcx) > > > > - > > > > - .p2align 4 > > > > -L(CopyVecSizeVecExit): > > > > - bsf %edx, %edx > > > > - add $(VEC_SIZE - 1), %r8 > > > > - add %rcx, %rdi > > > > -# ifdef USE_AS_STPCPY > > > > - lea (%rdi, %rdx), %rax > > > > -# endif > > > > - sub %rdx, %r8 > > > > - lea 1(%rdi, %rdx), %rdi > > > > - > > > > - .p2align 4 > > > > -L(StrncpyFillTailWithZero): > > > > - xor %edx, %edx > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(StrncpyFillExit) > > > > - > > > > - VMOVU %YMMZERO, (%rdi) > > > > - add $VEC_SIZE, %rdi > > > > - > > > > - mov %rdi, %rsi > > > > - and $(VEC_SIZE - 1), %esi > > > > - sub %rsi, %rdi > > > > - add %rsi, %r8 > > > > - sub $(VEC_SIZE * 4), %r8 > > > > - jb L(StrncpyFillLessFourVecSize) > > > > - > > > > -L(StrncpyFillLoopVmovdqa): > > > > - VMOVA %YMMZERO, (%rdi) > > > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > > > - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) > > > > - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) > > > > - add $(VEC_SIZE * 4), %rdi > > > > - sub $(VEC_SIZE * 4), %r8 > > > > - jae L(StrncpyFillLoopVmovdqa) > > > > - > > > > -L(StrncpyFillLessFourVecSize): > > > > - add 
$(VEC_SIZE * 2), %r8 > > > > - jl L(StrncpyFillLessTwoVecSize) > > > > - VMOVA %YMMZERO, (%rdi) > > > > - VMOVA %YMMZERO, VEC_SIZE(%rdi) > > > > - add $(VEC_SIZE * 2), %rdi > > > > - sub $VEC_SIZE, %r8 > > > > - jl L(StrncpyFillExit) > > > > - VMOVA %YMMZERO, (%rdi) > > > > - add $VEC_SIZE, %rdi > > > > - jmp L(Fill) > > > > - > > > > - .p2align 4 > > > > -L(StrncpyFillLessTwoVecSize): > > > > - add $VEC_SIZE, %r8 > > > > - jl L(StrncpyFillExit) > > > > - VMOVA %YMMZERO, (%rdi) > > > > - add $VEC_SIZE, %rdi > > > > - jmp L(Fill) > > > > - > > > > - .p2align 4 > > > > -L(StrncpyFillExit): > > > > - add $VEC_SIZE, %r8 > > > > -L(Fill): > > > > - cmp $17, %r8d > > > > - jae L(Fill17_32) > > > > - cmp $9, %r8d > > > > - jae L(Fill9_16) > > > > - cmp $5, %r8d > > > > - jae L(Fill5_8) > > > > - cmp $3, %r8d > > > > - jae L(Fill3_4) > > > > - cmp $1, %r8d > > > > - ja L(Fill2) > > > > - je L(Fill1) > > > > + .p2align 4,, 4 > > > > +L(page_cross_copy_4_7): > > > > + movl (%rsi), %ecx > > > > + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi > > > > + movl %ecx, (%rdi) > > > > + movl %esi, -(4 - CHAR_SIZE)(%END_REG) > > > > ret > > > > - > > > > -/* end of ifndef USE_AS_STRCAT */ > > > > # endif > > > > > > > > - .p2align 4 > > > > -L(UnalignedLeaveCase2OrCase3): > > > > - test %rdx, %rdx > > > > - jnz L(UnalignedFourVecSizeLeaveCase2) > > > > -L(UnalignedFourVecSizeLeaveCase3): > > > > - lea (VEC_SIZE * 4)(%r8), %rcx > > > > - and $-VEC_SIZE, %rcx > > > > - add $(VEC_SIZE * 3), %r8 > > > > - jl L(CopyVecSizeCase3) > > > > - VMOVU %YMM4, (%rdi) > > > > - sub $VEC_SIZE, %r8 > > > > - jb L(CopyVecSizeCase3) > > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > > - sub $VEC_SIZE, %r8 > > > > - jb L(CopyVecSizeCase3) > > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > > - sub $VEC_SIZE, %r8 > > > > - jb L(CopyVecSizeCase3) > > > > - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) > > > > -# ifdef USE_AS_STPCPY > > > > - lea (VEC_SIZE * 4)(%rdi), %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (VEC_SIZE * 4)(%rdi) > > > > -# endif > > > > +# if VEC_SIZE == 64 > > > > + .p2align 4,, 4 > > > > +L(page_cross_copy_32_63): > > > > + VMOVU (%rsi), %VMM_256(0) > > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > > + VMOVU %VMM_256(0), (%rdi) > > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) > > > > ret > > > > - > > > > - .p2align 4 > > > > -L(UnalignedFourVecSizeLeaveCase2): > > > > - xor %ecx, %ecx > > > > - vpcmpb $0, %YMM4, %YMMZERO, %k1 > > > > - kmovd %k1, %edx > > > > - add $(VEC_SIZE * 3), %r8 > > > > - jle L(CopyVecSizeCase2OrCase3) > > > > - test %edx, %edx > > > > -# ifndef USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec4) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > - vpcmpb $0, %YMM5, %YMMZERO, %k2 > > > > - kmovd %k2, %edx > > > > - VMOVU %YMM4, (%rdi) > > > > - add $VEC_SIZE, %rcx > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > - test %edx, %edx > > > > -# ifndef USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec5) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > # endif > > > > > > > > - vpcmpb $0, %YMM6, %YMMZERO, %k3 > > > > - kmovd %k3, %edx > > > > - VMOVU %YMM5, VEC_SIZE(%rdi) > > > > - add $VEC_SIZE, %rcx > > > > - sub $VEC_SIZE, %r8 > > > > - jbe L(CopyVecSizeCase2OrCase3) > > > > - test %edx, %edx > > > > -# ifndef USE_AS_STRCAT > > > > - jnz L(CopyVecSizeUnalignedVec6) > > > > -# else > > > > - jnz L(CopyVecSize) > > > > -# endif > > > > - > > > > - vpcmpb $0, %YMM7, %YMMZERO, %k4 > > > > 
- kmovd %k4, %edx > > > > - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) > > > > - lea VEC_SIZE(%rdi, %rcx), %rdi > > > > - lea VEC_SIZE(%rsi, %rcx), %rsi > > > > - bsf %edx, %edx > > > > - cmp %r8d, %edx > > > > - jb L(CopyVecSizeExit) > > > > -L(StrncpyExit): > > > > - cmp $65, %r8d > > > > - je L(StrncpyExit65) > > > > - cmp $33, %r8d > > > > - jae L(StrncpyExit33_64) > > > > - cmp $17, %r8d > > > > - jae L(StrncpyExit17_32) > > > > - cmp $9, %r8d > > > > - jae L(StrncpyExit9_16) > > > > - cmp $5, %r8d > > > > - jae L(StrncpyExit5_8) > > > > - cmp $3, %r8d > > > > - jae L(StrncpyExit3_4) > > > > - cmp $1, %r8d > > > > - ja L(StrncpyExit2) > > > > - je L(StrncpyExit1) > > > > -# ifdef USE_AS_STPCPY > > > > - mov %rdi, %rax > > > > -# endif > > > > -# ifdef USE_AS_STRCAT > > > > - movb $0, (%rdi) > > > > -# endif > > > > + .p2align 4,, 4 > > > > +L(page_cross_copy_16_31): > > > > + vmovdqu (%rsi), %xmm0 > > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > > + vmovdqu %xmm0, (%rdi) > > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) > > > > ret > > > > > > > > - .p2align 4 > > > > -L(ExitZero): > > > > -# ifndef USE_AS_STRCAT > > > > - mov %rdi, %rax > > > > -# endif > > > > + .p2align 4,, 4 > > > > +L(page_cross_copy_8_15): > > > > + movq (%rsi), %rcx > > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > > > + movq %rcx, (%rdi) > > > > + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) > > > > ret > > > > - > > > > -# endif > > > > - > > > > -# ifndef USE_AS_STRCAT > > > > -END (STRCPY) > > > > -# else > > > > -END (STRCAT) > > > > # endif > > > > +END(STRCPY) > > > > #endif > > > > diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S > > > > index 203a19bf21..d648ba5cfe 100644 > > > > --- a/sysdeps/x86_64/multiarch/strncat-evex.S > > > > +++ b/sysdeps/x86_64/multiarch/strncat-evex.S > > > > @@ -1,7 +1,520 @@ > > > > -#ifndef STRNCAT > > > > -# define STRNCAT __strncat_evex > > > > -#endif > > > > +/* {wcs|str}ncat with 256/512-bit EVEX. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > > + moment. 
*/ > > > > +# define USE_EVEX_MASKED_STORE 0 > > > > + > > > > +# include <sysdep.h> > > > > + > > > > +# ifndef VEC_SIZE > > > > +# include "x86-evex256-vecs.h" > > > > +# endif > > > > + > > > > +# ifndef STRNCAT > > > > +# define STRNCAT __strncat_evex > > > > +# endif > > > > + > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > +# define movNULL movl > > > > +# define VMOVU_MASK vmovdqu32 > > > > +# define VPMIN vpminud > > > > +# define VPTESTN vptestnmd > > > > +# define VPTEST vptestmd > > > > +# define VPCMPEQ vpcmpeqd > > > > +# define CHAR_SIZE 4 > > > > + > > > > +# define REP_MOVS rep movsd > > > > + > > > > +# define VMASK_REG VR10 > > > > +# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst > > > > + > > > > +# define USE_WIDE_CHAR > > > > +# else > > > > +# define movNULL movb > > > > +# define VMOVU_MASK vmovdqu8 > > > > +# define VPMIN vpminub > > > > +# define VPTESTN vptestnmb > > > > +# define VPTEST vptestmb > > > > +# define VPCMPEQ vpcmpeqb > > > > +# define CHAR_SIZE 1 > > > > + > > > > +# define REP_MOVS rep movsb > > > > + > > > > +# define VMASK_REG VRCX > > > > +# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst > > > > + > > > > +# endif > > > > + > > > > +# include "strncpy-or-cat-overflow-def.h" > > > > + > > > > +# include "reg-macros.h" > > > > + > > > > + > > > > +# define VZERO VMM(7) > > > > +# define VZERO_128 VMM_128(7) > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > + > > > > + .section SECTION(.text), "ax", @progbits > > > > +ENTRY(STRNCAT) > > > > + movq %rdi, %rax > > > > + > > > > + /* NB: It's safe to filter out zero-length strings WITHOUT > > > > + setting null-term. Destination MUST be a null-terminated > > > > + string so essentially the work is already done. */ > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq -1(%rdx), %rcx > > > > + shrq $56, %rcx > > > > + jnz L(zero_len) > > > > +# else > > > > + test %rdx, %rdx > > > > + jle L(zero_len) > > > > +# endif > > > > + > > > > +# include "strcat-strlen-evex.S" > > > > + > > > > + movl %esi, %ecx > > > > + andl $(PAGE_SIZE - 1), %ecx > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > > > > + ja L(page_cross) > > > > +L(page_cross_continue): > > > > + VMOVU (%rsi), %VMM(0) > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + > > > > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > > > > + <= CHAR_PER_VEC with masked instructions (which have > > > > + potential for dramatically bad perf if dst splits a page and > > > > + is not in the TLB). */ > > > > +# if USE_EVEX_MASKED_STORE > > > > + KMOV %k0, %VRCX > > > > + FIND_FIRST_ONE (VRCX, VR8) > > > > + cmpq %r8, %rdx > > > > + jbe L(less_1x_vec) > > > > + > > > > + test %VRCX, %VRCX > > > > + jz L(more_1x_vec) > > > > + > > > > + blsmsk %VRCX, %VRCX > > > > + KMOV %VRCX, %k1 > > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > > + ret > > > > + > > > > +L(less_1x_vec): > > > > + mov $-1, %VRCX > > > > + bzhi %VRDX, %VRCX, %VRCX > > > > + KMOV %VRCX, %k1 > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + VMOVU_MASK %VMM(0), (%rdi){%k1} > > > > + > > > > + ret > > > > +# else > > > > + KMOV %k0, %VMASK_REG > > > > + /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf > > > > + %VMASK_REG, %VRCX` for wcsncat. */ > > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > > + cmpq %rcx, %rdx > > > > + jbe L(less_1x_vec) > > > > + > > > > + /* If there were no zero-CHARs (rcx was zero before > > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > > > + cmpl $CHAR_PER_VEC, %ecx > > > > + je L(more_1x_vec) > > > > + > > > > + movl %ecx, %edx > > > > + > > > > +L(less_1x_vec): > > > > +# if VEC_SIZE == 64 > > > > + cmpl $(32 / CHAR_SIZE), %edx > > > > + jae L(copy_32_63) > > > > +# endif > > > > + > > > > + cmpl $(16 / CHAR_SIZE), %edx > > > > + jae L(copy_16_31) > > > > + > > > > + > > > > + cmpl $(8 / CHAR_SIZE), %edx > > > > + jae L(copy_8_15) > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > + vmovd %VMM_128(0), (%rdi) > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > +# else > > > > + > > > > + cmpl $4, %edx > > > > + jae L(copy_4_7) > > > > + > > > > + movzbl (%rsi), %ecx > > > > + cmpl $1, %edx > > > > + jbe L(set_null_term) > > > > + > > > > + movzwl 1(%rsi), %esi > > > > + movw %si, 1(%rdi) > > > > + > > > > + .p2align 4,, 1 > > > > +L(set_null_term): > > > > + movb %cl, (%rdi) > > > > + movNULL $0, (%rdi, %rdx) > > > > + ret > > > > +# endif > > > > + > > > > +# if VEC_SIZE == 64 > > > > + .p2align 4,, 6 > > > > +L(copy_32_63): > > > > + VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > > + VMOVU %VMM_256(0), (%rdi) > > > > + VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE) > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > +# endif > > > > + .p2align 4,, 6 > > > > +L(copy_16_31): > > > > + /* Use xmm1 explicitly here as it won't require a `vzeroupper` > > > > + and will save code size. */ > > > > + vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > > + VMOVU %VMM_128(0), (%rdi) > > > > + vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE) > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > + > > > > + .p2align 4,, 2 > > > > +L(copy_8_15): > > > > + movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx > > > > + vmovq %VMM_128(0), (%rdi) > > > > + movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE) > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > + > > > > +# ifndef USE_AS_WCSCPY > > > > + .p2align 4,, 12 > > > > +L(copy_4_7): > > > > + movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx > > > > + vmovd %VMM_128(0), (%rdi) > > > > + movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE) > > > > + movNULL $0, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > +# endif > > > > + > > > > +# endif > > > > + .p2align 4,, 4 > > > > +L(zero_len): > > > > +# ifdef USE_AS_WCSCPY > > > > + test %rdx, %rdx > > > > +# endif > > > > + jne OVERFLOW_STRCAT > > > > + ret > > > > > > > > -#define USE_AS_STRNCAT > > > > -#define STRCAT STRNCAT > > > > -#include "strcat-evex.S" > > > > + .p2align 4,, 8 > > > > +L(more_1x_vec): > > > > + VMOVU %VMM(0), (%rdi) > > > > + > > > > + /* We are going to align rsi here so will need to be able to re- > > > > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > > > > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > > > > + > > > > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > > > > + subq %rsi, %rdi > > > > + andq $-(VEC_SIZE), %rsi > > > > +L(loop_last_4x_vec): > > > > + addq %rsi, %rdi > > > > + subq %rsi, %rdx > > > > +# ifdef USE_AS_WCSCPY > > > > + shrq $2, %rdx > > > > +# endif > > > > + > > > > + /* Will need this regardless. 
*/ > > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > > + KMOV %k0, %VMASK_REG > > > > + > > > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > > > + ja L(more_2x_vec) > > > > + > > > > +L(last_2x_vec): > > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_vec_x1_len) > > > > + > > > > + /* If there were no zero-CHARs (rcx was zero before > > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ > > > > + cmpl $CHAR_PER_VEC, %ecx > > > > + jne L(ret_vec_x1) > > > > + > > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + addl $-CHAR_PER_VEC, %edx > > > > + bzhi %VRDX, %VRCX, %VR8 > > > > + jz L(ret_vec_x2_len) > > > > +L(ret_vec_x2): > > > > + bsf %VRCX, %VRDX > > > > +L(ret_vec_x2_len): > > > > + VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > > > + VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > + > > > > + .p2align 4,, 4 > > > > +L(ret_vec_x1_len): > > > > + movl %edx, %ecx > > > > +L(ret_vec_x1): > > > > + VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE) > > > > + VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > + VZEROUPPER_RETURN > > > > + > > > > + > > > > + .p2align 4,, 8 > > > > +L(last_4x_vec): > > > > + addl $-(CHAR_PER_VEC * 4), %edx > > > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > > + KMOV %k0, %VMASK_REG > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpl $(CHAR_PER_VEC * 2), %edx > > > > + jbe L(last_2x_vec) > > > > + .p2align 4,, 8 > > > > +L(more_2x_vec): > > > > +# ifdef USE_AS_WCSCPY > > > > + xorl %ecx, %ecx > > > > +# endif > > > > + bsf %VMASK_REG, %VRCX > > > > + jnz L(ret_vec_x1) > > > > + > > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x2) > > > > + > > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > > + KMOV %k0, %VMASK_REG > > > > + > > > > + cmpq $(CHAR_PER_VEC * 4), %rdx > > > > + ja L(more_4x_vec) > > > > + > > > > + /* Adjust length before going to L(ret_vec_x3_len) or > > > > + L(ret_vec_x3). */ > > > > + addl $(CHAR_PER_VEC * -2), %edx > > > > + > > > > + FIND_FIRST_ONE (VMASK_REG, VRCX) > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_vec_x3_len) > > > > + > > > > + /* If there were no zero-CHARs (rcx was zero before > > > > + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ > > > > + cmpl $CHAR_PER_VEC, %ecx > > > > + jne L(ret_vec_x3) > > > > + > > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > > + KMOV %k0, %VRCX > > > > + addl $-CHAR_PER_VEC, %edx > > > > + bzhi %VRDX, %VRCX, %VR8 > > > > + jz L(ret_vec_x4_len) > > > > +L(ret_vec_x4): > > > > + bsf %VRCX, %VRDX > > > > +L(ret_vec_x4_len): > > > > + VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE) > > > > + VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > + > > > > + .p2align 4,, 4 > > > > +L(ret_vec_x3_len): > > > > + movl %edx, %ecx > > > > +L(ret_vec_x3): > > > > + VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > > > + VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(more_4x_vec): > > > > +# ifdef USE_AS_WCSCPY > > > > + xorl %ecx, %ecx > > > > +# endif > > > > + bsf %VMASK_REG, %VRCX > > > > + jnz L(ret_vec_x3) > > > > + > > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x4) > > > > + > > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > > + > > > > + /* Check if we are near the end before aligning. */ > > > > + cmpq $(CHAR_PER_VEC * 8), %rdx > > > > + jbe L(last_4x_vec) > > > > + > > > > + > > > > + /* Add rsi to rdx (length) before aligning rsi. NB: Since we > > > > + filtered out huge lengths this cannot overflow. */ > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > > > +# else > > > > + addq %rsi, %rdx > > > > +# endif > > > > + > > > > + /* Subtract rsi from rdi before aligning (add back will have > > > > + correct rdi for aligned rsi). */ > > > > + subq %rsi, %rdi > > > > + subq $-(VEC_SIZE * 5), %rsi > > > > + andq $(VEC_SIZE * -4), %rsi > > > > + > > > > + /* Load first half of the loop before entry. */ > > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + > > > > + /* Offset rsi by VEC_SIZE so that we can jump to > > > > + L(loop_last_4x_vec). */ > > > > + addq $-(VEC_SIZE), %rsi > > > > + KORTEST %k2, %k4 > > > > + jnz L(loop_4x_done) > > > > + > > > > + /* Store loop end in r9. 
*/ > > > > + leaq -(VEC_SIZE * 5)(%rdx), %r9 > > > > + > > > > + .p2align 4,, 11 > > > > +L(loop_4x_vec): > > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > > > + > > > > + subq $(VEC_SIZE * -4), %rsi > > > > + cmpq %rsi, %r9 > > > > + jbe L(loop_last_4x_vec) > > > > + > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + KORTEST %k2, %k4 > > > > + jz L(loop_4x_vec) > > > > + > > > > +L(loop_4x_done): > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + /* Restore rdi (dst). */ > > > > + addq %rsi, %rdi > > > > + > > > > + /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so > > > > + test with bsf. */ > > > > + bsf %VRCX, %VRCX > > > > + jnz L(ret_vec_x1) > > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi) > > > > + > > > > + KMOV %k2, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x2) > > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > > > + > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + bsf %VRCX, %VRCX > > > > + jnz L(ret_vec_x3) > > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > > > + > > > > + KMOV %k4, %VRCX > > > > + bsf %VRCX, %VRCX > > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) > > > > + ret > > > > + > > > > + > > > > + .p2align 4,, 4 > > > > +L(page_cross): > > > > + movq %rsi, %r8 > > > > + andq $(VEC_SIZE * -1), %r8 > > > > + VPCMPEQ (%r8), %VZERO, %k0 > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > + KMOV %k0, %VR9 > > > > + shrl $2, %ecx > > > > + andl $(CHAR_PER_VEC - 1), %ecx > > > > + shrx %VRCX, %VR9, %VRCX > > > > +# else > > > > + KMOV %k0, %VRCX > > > > + shrx %VRSI, %VRCX, %VRCX > > > > +# endif > > > > + > > > > + subl %esi, %r8d > > > > + andl $(VEC_SIZE - 1), %r8d > > > > +# ifdef USE_AS_WCSCPY > > > > + shrl $2, %r8d > > > > +# endif > > > > + cmpq %r8, %rdx > > > > + jbe L(page_cross_small) > > > > + /* Optimizing more for space as this is very cold code. This > > > > + saves 2x cache lines. */ > > > > + > > > > + /* This adds once to the later result which will get correct > > > > + copy bounds. NB: this can never zero-out a non-zero RCX as > > > > + to be in the page cross case rsi cannot be aligned and we > > > > + already right-shift rcx by the misalignment. 
*/ > > > > + shl %VRCX > > > > + jz L(page_cross_continue) > > > > + bsf %VRCX, %VRCX > > > > + REP_MOVS > > > > + ret > > > > + > > > > +L(page_cross_small): > > > > + tzcnt %VRCX, %VRCX > > > > + jz L(page_cross_setz) > > > > + cmpl %edx, %ecx > > > > + cmova %edx, %ecx > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > + rep movsd > > > > +# else > > > > + rep movsb > > > > +# endif > > > > +L(page_cross_setz): > > > > + movNULL $0, (%rdi) > > > > + ret > > > > +END(STRNCAT) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S > > > > index 1b3426d511..49eaf4cbd9 100644 > > > > --- a/sysdeps/x86_64/multiarch/strncpy-evex.S > > > > +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S > > > > @@ -1,7 +1,990 @@ > > > > -#ifndef STRNCPY > > > > -# define STRNCPY __strncpy_evex > > > > -#endif > > > > +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. > > > > + Copyright (C) 2022 Free Software Foundation, Inc. > > > > + This file is part of the GNU C Library. > > > > + > > > > + The GNU C Library is free software; you can redistribute it and/or > > > > + modify it under the terms of the GNU Lesser General Public > > > > + License as published by the Free Software Foundation; either > > > > + version 2.1 of the License, or (at your option) any later version. > > > > + > > > > + The GNU C Library is distributed in the hope that it will be useful, > > > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > > > + Lesser General Public License for more details. > > > > + > > > > + You should have received a copy of the GNU Lesser General Public > > > > + License along with the GNU C Library; if not, see > > > > + <https://www.gnu.org/licenses/>. */ > > > > + > > > > +#include <isa-level.h> > > > > + > > > > +#if ISA_SHOULD_BUILD (4) > > > > + > > > > + /* Use evex-masked stores for small sizes. Turned off at the > > > > + moment. 
*/ > > > > +# define USE_EVEX_MASKED_STORE 0 > > > > + > > > > + > > > > +# include <sysdep.h> > > > > +# ifndef VEC_SIZE > > > > +# include "x86-evex256-vecs.h" > > > > +# endif > > > > + > > > > + > > > > +# ifndef STRNCPY > > > > +# define STRNCPY __strncpy_evex > > > > +# endif > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > +# define VMOVU_MASK vmovdqu32 > > > > +# define VPCMPEQ vpcmpeqd > > > > +# define VPMIN vpminud > > > > +# define VPTESTN vptestnmd > > > > +# define VPTEST vptestmd > > > > +# define CHAR_SIZE 4 > > > > + > > > > +# define REP_MOVS rep movsd > > > > +# define REP_STOS rep stosl > > > > + > > > > +# define USE_WIDE_CHAR > > > > + > > > > +# else > > > > +# define VMOVU_MASK vmovdqu8 > > > > +# define VPCMPEQ vpcmpeqb > > > > +# define VPMIN vpminub > > > > +# define VPTESTN vptestnmb > > > > +# define VPTEST vptestmb > > > > +# define CHAR_SIZE 1 > > > > + > > > > +# define REP_MOVS rep movsb > > > > +# define REP_STOS rep stosb > > > > +# endif > > > > + > > > > +# include "strncpy-or-cat-overflow-def.h" > > > > + > > > > +# define PAGE_SIZE 4096 > > > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > > > + > > > > +# include "reg-macros.h" > > > > + > > > > + > > > > +# define VZERO VMM(7) > > > > +# define VZERO_256 VMM_256(7) > > > > +# define VZERO_128 VMM_128(7) > > > > + > > > > +# if VEC_SIZE == 64 > > > > +# define VZERO_HALF VZERO_256 > > > > +# else > > > > +# define VZERO_HALF VZERO_128 > > > > +# endif > > > > + > > > > + .section SECTION(.text), "ax", @progbits > > > > +ENTRY(STRNCPY) > > > > + /* Filter zero length strings and very long strings. Zero > > > > + length strings just return, very long strings are handled by > > > > + just running rep stos{b|l} to zero set (which will almost > > > > + certainly segfault), if that succeeds then just calling > > > > + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ > > > > +# ifdef USE_AS_WCSCPY > > > > + decq %rdx > > > > + movq %rdx, %rax > > > > + /* 56 is end of max supported address space. */ > > > > + shr $56, %rax > > > > + jnz L(zero_len) > > > > +# else > > > > + decq %rdx > > > > + /* If the flag needs to become `jb` replace `dec` with `sub`. > > > > + */ > > > > + jl L(zero_len) > > > > +# endif > > > > + > > > > + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 > > > > + movl %esi, %eax > > > > + andl $(PAGE_SIZE - 1), %eax > > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > > > + ja L(page_cross) > > > > + > > > > +L(page_cross_continue): > > > > + VMOVU (%rsi), %VMM(0) > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + > > > > + /* If no STPCPY just save end ahead of time. */ > > > > +# ifndef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > + > > > > + > > > > + cmpq $(CHAR_PER_VEC), %rdx > > > > + > > > > + /* If USE_EVEX_MASK_STORE is enabled then we just handle length > > > > + <= CHAR_PER_VEC with masked instructions (which have > > > > + potential for dramatically bad perf if dst splits a page and > > > > + is not in the TLB). */ > > > > +# if USE_EVEX_MASKED_STORE > > > > + /* `jae` because length rdx is now length - 1. */ > > > > + jae L(more_1x_vec) > > > > + > > > > + /* If there where multiple zero-CHAR matches in the first VEC, > > > > + VRCX will be overset but thats fine since any oversets where > > > > + at zero-positions anyways. 
*/ > > > > + > > > > +# ifdef USE_AS_STPCPY > > > > + tzcnt %VRCX, %VRAX > > > > + cmpl %eax, %edx > > > > + cmovb %edx, %eax > > > > +# ifdef USE_AS_WCSCPY > > > > + adcl $0, %eax > > > > + leaq (%rdi, %rax, CHAR_SIZE), %rax > > > > +# else > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + dec %VRCX > > > > + > > > > + /* Zero out all non-zero CHAR's after the first zero match. */ > > > > + KMOV %VRCX, %k1 > > > > + > > > > + /* Use VZERO as destination so this can be reused for > > > > + L(zfill_less_vec) (which if jumped to by subsequent logic > > > > + will have zerod out VZERO. */ > > > > + VMOVU_MASK %VMM(0), %VZERO{%k1}{z} > > > > +L(zfill_less_vec): > > > > + /* Get mask for what we need to set. */ > > > > + incl %edx > > > > + mov $-1, %VRCX > > > > + bzhi %VRDX, %VRCX, %VRCX > > > > + KMOV %VRCX, %k1 > > > > + VMOVU_MASK %VZERO, (%rdi){%k1} > > > > + ret > > > > + > > > > + .p2align 4,, 4 > > > > +L(zero_len): > > > > + cmpq $-1, %rdx > > > > + jne L(best_effort_strncpy) > > > > + movq %rdi, %rax > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(more_1x_vec): > > > > +# else > > > > + /* `jb` because length rdx is now length - 1. */ > > > > + jb L(less_1x_vec) > > > > +# endif > > > > + > > > > + > > > > + /* This may overset but thats fine because we still need to zero > > > > + fill. */ > > > > + VMOVU %VMM(0), (%rdi) > > > > + > > > > + > > > > + /* Length must be >= CHAR_PER_VEC so match here means we must > > > > + zero-fill. */ > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill) > > > > + > > > > + > > > > + /* We are going to align rsi here so will need to be able to re- > > > > + adjust rdi/rdx afterwords. NB: We filtered out huge lengths > > > > + so rsi + rdx * CHAR_SIZE cannot overflow. */ > > > > + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx > > > > + subq %rsi, %rdi > > > > + andq $-(VEC_SIZE), %rsi > > > > + > > > > +L(loop_last_4x_vec): > > > > + addq %rsi, %rdi > > > > + subq %rsi, %rdx > > > > +# ifdef USE_AS_WCSCPY > > > > + shrq $2, %rdx > > > > +# endif > > > > + > > > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > > + KMOV %k0, %VRCX > > > > + > > > > + /* -1 because of the `dec %rdx` earlier. */ > > > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > > > + ja L(more_2x_vec) > > > > + > > > > +L(last_2x_vec): > > > > + /* This will be need to be computed no matter what. We do it > > > > + ahead of time for CHAR_PER_VEC == 64 because we can't adjust > > > > + the value of `tzcnt` with a shift. */ > > > > +# if CHAR_PER_VEC == 64 > > > > + tzcntq %rcx, %rcx > > > > +# endif > > > > + > > > > + cmpl $(CHAR_PER_VEC), %edx > > > > + jb L(ret_vec_x1_len) > > > > + > > > > + /* Seperate logic for CHAR_PER_VEC == 64 because we already did > > > > + `tzcnt` on VRCX. */ > > > > +# if CHAR_PER_VEC == 64 > > > > + /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */ > > > > + cmpb $CHAR_PER_VEC, %cl > > > > + jnz L(ret_vec_x1_no_bsf) > > > > +# else > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x1) > > > > +# endif > > > > + > > > > + > > > > + > > > > + VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 > > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > > + KMOV %k0, %VRCX > > > > + > > > > +# if CHAR_PER_VEC < 64 > > > > + /* This essentiallys adds CHAR_PER_VEC to computed result. 
*/ > > > > + shlq $CHAR_PER_VEC, %rcx > > > > +# else > > > > + tzcntq %rcx, %rcx > > > > + addl $CHAR_PER_VEC, %ecx > > > > +# endif > > > > + > > > > + .p2align 4,, 4 > > > > +L(ret_vec_x1_len): > > > > + /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has > > > > + already been done. */ > > > > +# if CHAR_PER_VEC < 64 > > > > + tzcntq %rcx, %rcx > > > > +# endif > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_vec_x1_len_no_zfill) > > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > +L(ret_vec_x1_len_no_zfill_mov): > > > > + movl %ecx, %edx > > > > +# ifdef USE_AS_STPCPY > > > > + /* clear flags. */ > > > > + xorl %ecx, %ecx > > > > +# endif > > > > +L(ret_vec_x1_len_no_zfill): > > > > + VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + leal (VEC_SIZE)(%rdx), %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > + > > > > + > > > > + .p2align 4,, 10 > > > > +L(ret_vec_x1): > > > > + bsf %VRCX, %VRCX > > > > +L(ret_vec_x1_no_bsf): > > > > + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > + subl %ecx, %edx > > > > + cmpl $CHAR_PER_VEC, %edx > > > > + jb L(ret_vec_x1_len_no_zfill_mov) > > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > > + VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > + leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(last_4x_vec): > > > > + /* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl > > > > + $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just > > > > + using `movzbl`. */ > > > > +# if CHAR_PER_VEC == 64 > > > > + movzbl %dl, %edx > > > > +# else > > > > + andl $(CHAR_PER_VEC * 4 - 1), %edx > > > > +# endif > > > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) > > > > + VPTESTN %VMM(1), %VMM(1), %k0 > > > > + KMOV %k0, %VRCX > > > > + subq $-(VEC_SIZE * 4), %rsi > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpl $(CHAR_PER_VEC * 2 - 1), %edx > > > > + jbe L(last_2x_vec) > > > > + .p2align 4,, 8 > > > > +L(more_2x_vec): > > > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > > > + test %VRCX, %VRCX > > > > + /* Must fill at least 2x VEC. */ > > > > + jnz L(zfill_vec1) > > > > + > > > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) > > > > + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + /* Must fill at least 1x VEC. */ > > > > + jnz L(zfill_vec2) > > > > + > > > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) > > > > + VPTESTN %VMM(3), %VMM(3), %k0 > > > > + KMOV %k0, %VRCX > > > > + > > > > + /* Check if len is more 4x VEC. -1 because rdx is len - 1. 
*/ > > > > + cmpq $(CHAR_PER_VEC * 4 - 1), %rdx > > > > + ja L(more_4x_vec) > > > > + > > > > + subl $(CHAR_PER_VEC * 3), %edx > > > > + jb L(ret_vec_x3_len) > > > > + > > > > + test %VRCX, %VRCX > > > > + jnz L(ret_vec_x3) > > > > + > > > > + VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + KMOV %k0, %VRCX > > > > + tzcnt %VRCX, %VRCX > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_vec_x4_len_no_zfill) > > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > > + VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > + movl %ecx, %edx > > > > +L(ret_vec_x4_len_no_zfill): > > > > + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + leal (VEC_SIZE * 4 + 0)(%rdx), %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > + > > > > + > > > > +L(ret_vec_x3_len): > > > > + addl $(CHAR_PER_VEC * 1), %edx > > > > + tzcnt %VRCX, %VRCX > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_vec_x3_len_no_zfill) > > > > + /* Fall through (expectation) is copy len < buffer len. */ > > > > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > +L(ret_vec_x3_len_no_zfill_mov): > > > > + movl %ecx, %edx > > > > +# ifdef USE_AS_STPCPY > > > > + /* clear flags. */ > > > > + xorl %ecx, %ecx > > > > +# endif > > > > + .p2align 4,, 4 > > > > +L(ret_vec_x3_len_no_zfill): > > > > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) > > > > + VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + leal (VEC_SIZE * 3 + 0)(%rdx), %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > + > > > > + > > > > + .p2align 4,, 8 > > > > +L(ret_vec_x3): > > > > + bsf %VRCX, %VRCX > > > > + VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) > > > > + subl %ecx, %edx > > > > + jl L(ret_vec_x3_len_no_zfill_mov) > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(more_4x_vec): > > > > + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill_vec3) > > > > + > > > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) > > > > + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) > > > > + VPTESTN %VMM(4), %VMM(4), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill_vec4) > > > > > > > > -#define USE_AS_STRNCPY > > > > -#define STRCPY STRNCPY > > > > -#include "strcpy-evex.S" > > > > + /* Recheck length before aligning. */ > > > > + cmpq $(CHAR_PER_VEC * 8 - 1), %rdx > > > > + jbe L(last_4x_vec) > > > > + > > > > + /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. 
*/ > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rsi, %rdx, CHAR_SIZE), %rdx > > > > +# else > > > > + addq %rsi, %rdx > > > > +# endif > > > > + subq %rsi, %rdi > > > > + subq $-(VEC_SIZE * 5), %rsi > > > > + andq $(VEC_SIZE * -4), %rsi > > > > + > > > > + > > > > + /* Load first half of the loop before entry. */ > > > > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + > > > > + > > > > + /* Offset rsi by VEC_SIZE so that we can jump to > > > > + L(loop_last_4x_vec). */ > > > > + addq $-(VEC_SIZE), %rsi > > > > + KORTEST %k2, %k4 > > > > + jnz L(loop_4x_done) > > > > + > > > > + /* Store loop end in r9. */ > > > > + leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 > > > > + > > > > + .p2align 4,, 11 > > > > +L(loop_4x_vec): > > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) > > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) > > > > + > > > > + subq $(VEC_SIZE * -4), %rsi > > > > + cmpq %rsi, %r9 > > > > + jbe L(loop_last_4x_vec) > > > > + > > > > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) > > > > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) > > > > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) > > > > + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) > > > > + > > > > + VPMIN %VMM(0), %VMM(1), %VMM(4) > > > > + VPMIN %VMM(2), %VMM(3), %VMM(6) > > > > + VPTESTN %VMM(4), %VMM(4), %k2 > > > > + VPTESTN %VMM(6), %VMM(6), %k4 > > > > + KORTEST %k2, %k4 > > > > + jz L(loop_4x_vec) > > > > + > > > > +L(loop_4x_done): > > > > + /* Restore rdx (length). */ > > > > + subq %rsi, %rdx > > > > +# ifdef USE_AS_WCSCPY > > > > + shrq $2, %rdx > > > > +# endif > > > > + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) > > > > + /* Restore rdi (dst). */ > > > > + addq %rsi, %rdi > > > > + VPTESTN %VMM(0), %VMM(0), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill_vec1) > > > > + > > > > + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) > > > > + KMOV %k2, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill_vec2) > > > > + > > > > + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) > > > > + VPTESTN %VMM(2), %VMM(2), %k0 > > > > + KMOV %k0, %VRCX > > > > + test %VRCX, %VRCX > > > > + jnz L(zfill_vec3) > > > > + > > > > + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) > > > > + KMOV %k4, %VRCX > > > > + // Zfill more.... > > > > + > > > > + .p2align 4,, 4 > > > > +L(zfill_vec4): > > > > + subq $(VEC_SIZE * -2), %rdi > > > > + addq $(CHAR_PER_VEC * -2), %rdx > > > > +L(zfill_vec2): > > > > + subq $(VEC_SIZE * -2), %rdi > > > > + addq $(CHAR_PER_VEC * -1), %rdx > > > > +L(zfill): > > > > + /* VRCX must be non-zero. */ > > > > + bsf %VRCX, %VRCX > > > > + > > > > + /* Adjust length / dst for zfill. */ > > > > + subq %rcx, %rdx > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > +L(zfill_from_page_cross): > > > > + > > > > + /* From here on out its just memset(rdi, 0, rdx). 
*/ > > > > + cmpq $CHAR_PER_VEC, %rdx > > > > + jb L(zfill_less_vec) > > > > + > > > > +L(zfill_more_1x_vec): > > > > + VMOVU %VZERO, (%rdi) > > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx > > > > + ja L(zfill_more_2x_vec) > > > > +L(zfill_done0): > > > > + ret > > > > + > > > > + /* Coming from vec1/vec2 we must be able to zfill at least 2x > > > > + VEC. */ > > > > + .p2align 4,, 8 > > > > +L(zfill_vec3): > > > > + subq $(VEC_SIZE * -2), %rdi > > > > + addq $(CHAR_PER_VEC * -2), %rdx > > > > + .p2align 4,, 2 > > > > +L(zfill_vec1): > > > > + bsfq %rcx, %rcx > > > > + /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. > > > > + */ > > > > + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > > > + subq %rcx, %rdx > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > + > > > > + > > > > + VMOVU %VZERO, (%rdi) > > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + cmpq $(CHAR_PER_VEC * 2), %rdx > > > > + jb L(zfill_done0) > > > > +L(zfill_more_2x_vec): > > > > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > > > + VMOVU %VZERO, (VEC_SIZE)(%rdi) > > > > + subq $(CHAR_PER_VEC * 4 - 1), %rdx > > > > + jbe L(zfill_done) > > > > + > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rdx > > > > +# else > > > > + addq %rdi, %rdx > > > > +# endif > > > > + > > > > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) > > > > + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) > > > > + > > > > + > > > > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) > > > > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) > > > > + > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpq %rdi, %rdx > > > > + jbe L(zfill_done) > > > > + > > > > + /* Align rdi and zfill loop. */ > > > > + andq $-(VEC_SIZE), %rdi > > > > + .p2align 4,, 12 > > > > +L(zfill_loop_4x_vec): > > > > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) > > > > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) > > > > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) > > > > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) > > > > + subq $-(VEC_SIZE * 4), %rdi > > > > + cmpq %rdi, %rdx > > > > + ja L(zfill_loop_4x_vec) > > > > +L(zfill_done): > > > > + ret > > > > + > > > > + > > > > + /* Less 1x VEC case if we are not using evex masked store. */ > > > > +# if !USE_EVEX_MASKED_STORE > > > > + .p2align 4,, 8 > > > > +L(copy_1x): > > > > + /* Special case for copy 1x. It can be handled quickly and many > > > > + buffer sizes have convenient alignment. */ > > > > + VMOVU %VMM(0), (%rdi) > > > > + /* If no zeros then we are done. */ > > > > + testl %ecx, %ecx > > > > + jz L(ret_1x_1x) > > > > + > > > > + /* Need to zfill, not we know that length <= CHAR_PER_VEC so we > > > > + only handle the small case here. */ > > > > + bsf %VRCX, %VRCX > > > > +L(zfill_less_vec_no_bsf): > > > > + /* Adjust length / dst then just zfill less_vec. 
*/ > > > > + subq %rcx, %rdx > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > + > > > > +L(zfill_less_vec): > > > > + cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx > > > > + jb L(zfill_less_half) > > > > + > > > > + VMOVU %VZERO_HALF, (%rdi) > > > > + VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > +# ifdef USE_AS_STPCPY > > > > +L(ret_1x_1x): > > > > + leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax > > > > + ret > > > > +# endif > > > > + > > > > + > > > > +# if VEC_SIZE == 64 > > > > + .p2align 4,, 4 > > > > +L(copy_32_63): > > > > + /* Overfill to avoid branches. */ > > > > + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) > > > > + VMOVU %VMM_256(0), (%rdi) > > > > + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + > > > > + /* We are taking advantage of the fact that to be here we must > > > > + be writing null-term as (%rdi, %rcx) we have a byte of lee- > > > > + way for overwriting. */ > > > > + cmpl %ecx, %edx > > > > + ja L(zfill_less_vec_no_bsf) > > > > +# ifndef USE_AS_STPCPY > > > > +L(ret_1x_1x): > > > > +# else > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > +# endif > > > > + > > > > + .p2align 4,, 4 > > > > +L(copy_16_31): > > > > + /* Overfill to avoid branches. */ > > > > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 > > > > + VMOVU %VMM_128(0), (%rdi) > > > > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + cmpl %ecx, %edx > > > > + > > > > + /* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then > > > > + we have a larger copy block for 32-63 so this is just falls > > > > + through to zfill 16-31. If VEC_SIZE == 32 then we check for > > > > + full zfill of less 1x VEC. */ > > > > +# if VEC_SIZE == 64 > > > > + jbe L(ret_16_31) > > > > + subl %ecx, %edx > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > +L(zfill_less_half): > > > > +L(zfill_less_32): > > > > + cmpl $(16 / CHAR_SIZE), %edx > > > > + jb L(zfill_less_16) > > > > + VMOVU %VZERO_128, (%rdi) > > > > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > + ret > > > > +# endif > > > > +L(ret_16_31): > > > > +# ifdef USE_AS_STPCPY > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > +# else > > > > + /* VEC_SIZE == 32 begins. */ > > > > + ja L(zfill_less_vec_no_bsf) > > > > +# ifndef USE_AS_STPCPY > > > > +L(ret_1x_1x): > > > > +# else > > > > +# ifdef USE_AS_WCSCPY > > > > + adcq $0, %rdx > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# endif > > > > + ret > > > > +# endif > > > > + > > > > + > > > > + .p2align 4,, 4 > > > > +L(copy_8_15): > > > > + /* Overfill to avoid branches. 
*/ > > > > + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > > > + vmovq %VMM_128(0), (%rdi) > > > > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_8_15) > > > > + subl %ecx, %edx > > > > +# ifdef USE_AS_WCSCPY > > > > + leaq (%rdi, %rcx, CHAR_SIZE), %rdi > > > > +# else > > > > + addq %rcx, %rdi > > > > +# endif > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > + .p2align 4,, 8 > > > > +# if VEC_SIZE == 32 > > > > +L(zfill_less_half): > > > > +# endif > > > > +L(zfill_less_16): > > > > + xorl %ecx, %ecx > > > > + cmpl $(8 / CHAR_SIZE), %edx > > > > + jb L(zfill_less_8) > > > > + movq %rcx, (%rdi) > > > > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > > > +# ifndef USE_AS_STPCPY > > > > +L(ret_8_15): > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(less_1x_vec): > > > > + je L(copy_1x) > > > > + > > > > + /* We will need `tzcnt` result for all other copy sizes. */ > > > > + tzcnt %VRCX, %VRCX > > > > +# if VEC_SIZE == 64 > > > > + cmpl $(32 / CHAR_SIZE), %edx > > > > + jae L(copy_32_63) > > > > +# endif > > > > + > > > > + cmpl $(16 / CHAR_SIZE), %edx > > > > + jae L(copy_16_31) > > > > + > > > > + cmpl $(8 / CHAR_SIZE), %edx > > > > + jae L(copy_8_15) > > > > +# ifdef USE_AS_WCSCPY > > > > + testl %ecx, %ecx > > > > + jz L(zfill_less_8_set_ret) > > > > + > > > > + movl (%rsi, %rdx, CHAR_SIZE), %esi > > > > + vmovd %VMM_128(0), (%rdi) > > > > + movl %esi, (%rdi, %rdx, CHAR_SIZE) > > > > +# ifdef USE_AS_STPCPY > > > > + cmpl %ecx, %edx > > > > +L(ret_8_15): > > > > + adcq $0, %rdx > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# endif > > > > + ret > > > > +L(zfill_less_8_set_ret): > > > > + xorl %ecx, %ecx > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > +L(zfill_less_8): > > > > + movl %ecx, (%rdi) > > > > + movl %ecx, (%rdi, %rdx, CHAR_SIZE) > > > > + ret > > > > +# else > > > > + cmpl $3, %edx > > > > + jb L(copy_0_3) > > > > + /* Overfill to avoid branches. 
*/ > > > > + movl -3(%rsi, %rdx), %esi > > > > + vmovd %VMM_128(0), (%rdi) > > > > + movl %esi, -3(%rdi, %rdx) > > > > + cmpl %ecx, %edx > > > > + jbe L(ret_4_7) > > > > + subq %rcx, %rdx > > > > + addq %rcx, %rdi > > > > +# ifdef USE_AS_STPCPY > > > > + movq %rdi, %rax > > > > +# endif > > > > + xorl %ecx, %ecx > > > > + .p2align 4,, 8 > > > > +L(zfill_less_8): > > > > + cmpl $3, %edx > > > > + jb L(zfill_less_3) > > > > + movl %ecx, (%rdi) > > > > + movl %ecx, -3(%rdi, %rdx) > > > > +# ifdef USE_AS_STPCPY > > > > + ret > > > > +# endif > > > > + > > > > +L(ret_4_7): > > > > +# ifdef USE_AS_STPCPY > > > > +L(ret_8_15): > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > + ret > > > > + > > > > + .p2align 4,, 4 > > > > +L(zfill_less_3): > > > > + testl %edx, %edx > > > > + jz L(zfill_1) > > > > + movw %cx, (%rdi) > > > > +L(zfill_1): > > > > + movb %cl, (%rdi, %rdx) > > > > + ret > > > > + > > > > + .p2align 4,, 8 > > > > +L(copy_0_3): > > > > + vmovd %VMM_128(0), %r8d > > > > + testl %edx, %edx > > > > + jz L(copy_1) > > > > + movw %r8w, (%rdi) > > > > + cmpl %ecx, %edx > > > > + ja L(zfill_from_1) > > > > + movzbl (%rsi, %rdx), %r8d > > > > +# ifdef USE_AS_STPCPY > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > + movb %r8b, (%rdi, %rdx) > > > > + ret > > > > +# endif > > > > + > > > > +L(copy_1): > > > > +# ifdef USE_AS_STPCPY > > > > + movl %edx, %eax > > > > + cmpl %ecx, %edx > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# ifdef USE_AS_WCSCPY > > > > + vmovd %VMM_128(0), (%rdi) > > > > +# else > > > > + movb %r8b, (%rdi, %rdx) > > > > +# endif > > > > + ret > > > > +# endif > > > > + > > > > + > > > > +# ifndef USE_AS_WCSCPY > > > > + .p2align 4,, 8 > > > > +L(zfill_from_1): > > > > +# ifdef USE_AS_STPCPY > > > > + leaq (%rdi, %rcx), %rax > > > > +# endif > > > > + movw $0, -1(%rdi, %rdx) > > > > + ret > > > > +# endif > > > > + > > > > + .p2align 4,, 4 > > > > +L(zero_len): > > > > + incq %rdx > > > > + jne L(best_effort_strncpy) > > > > + movq %rdi, %rax > > > > + ret > > > > +# endif > > > > + > > > > + > > > > + .p2align 4,, 4 > > > > + .p2align 6,, 8 > > > > +L(page_cross): > > > > + movq %rsi, %rax > > > > + andq $(VEC_SIZE * -1), %rax > > > > + VPCMPEQ (%rax), %VZERO, %k0 > > > > + KMOV %k0, %VRCX > > > > +# ifdef USE_AS_WCSCPY > > > > + movl %esi, %r8d > > > > + shrl $2, %r8d > > > > + andl $(CHAR_PER_VEC - 1), %r8d > > > > + shrx %VR8, %VRCX, %VRCX > > > > +# else > > > > + shrx %VRSI, %VRCX, %VRCX > > > > +# endif > > > > + > > > > + /* Compute amount of bytes we checked. */ > > > > + subl %esi, %eax > > > > + andl $(VEC_SIZE - 1), %eax > > > > +# ifdef USE_AS_WCSCPY > > > > + shrl $2, %eax > > > > +# endif > > > > + > > > > + /* If rax > rdx then we are finishing the copy at the end of the > > > > + page. */ > > > > + cmpq %rax, %rdx > > > > + jb L(page_cross_small) > > > > + > > > > + > > > > + /* If rcx is non-zero then continue. */ > > > > + test %VRCX, %VRCX > > > > + jz L(page_cross_continue) > > > > + > > > > + /* We found zero-CHAR so need to copy then zfill (we know we > > > > + didn't cover all of length here). 
*/ > > > > + bsf %VRCX, %VRCX > > > > +L(movsb_and_zfill): > > > > + incl %ecx > > > > + subq %rcx, %rdx > > > > +# ifdef USE_AS_STPCPY > > > > + leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax > > > > +# else > > > > + movq %rdi, %rax > > > > +# endif > > > > + > > > > + REP_MOVS > > > > +# ifdef USE_AS_WCSCPY > > > > + movl $0, (%rdi) > > > > +# else > > > > + movb $0, (%rdi) > > > > +# endif > > > > + jmp L(zfill_from_page_cross) > > > > + > > > > +L(page_cross_small): > > > > + tzcnt %VRCX, %VRCX > > > > + cmpl %ecx, %edx > > > > + jbe L(page_cross_copy_only) > > > > + > > > > + /* Do a zfill of the tail before copying. */ > > > > + movq %rdi, %r9 > > > > + xorl %eax, %eax > > > > + > > > > + movl %ecx, %r8d > > > > + > > > > + subl %ecx, %edx > > > > + leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi > > > > + movl %edx, %ecx > > > > + REP_STOS > > > > + movq %r9, %rdi > > > > + movl %r8d, %edx > > > > +L(page_cross_copy_only): > > > > + leal 1(%rdx), %ecx > > > > +# ifdef USE_AS_STPCPY > > > > +# ifdef USE_AS_WCSCPY > > > > + adcl $0, %edx > > > > + leaq (%rdi, %rdx, CHAR_SIZE), %rax > > > > +# else > > > > + movl %edx, %eax > > > > + adcq %rdi, %rax > > > > +# endif > > > > +# else > > > > + movq %rdi, %rax > > > > +# endif > > > > + REP_MOVS > > > > + ret > > > > + > > > > + > > > > +L(best_effort_strncpy): > > > > + movq %rdx, %rcx > > > > + xorl %eax, %eax > > > > + movq %rdi, %r8 > > > > + /* The length is >= 2^63. We very much so expect to segfault at > > > > + rep stos. If that doesn't happen then just strcpy to finish. > > > > + */ > > > > + REP_STOS > > > > + movq %r8, %rdi > > > > + jmp OVERFLOW_STRCPY > > > > +END(STRNCPY) > > > > +#endif > > > > diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > new file mode 100644 > > > > index 0000000000..d5ff4cbe50 > > > > --- /dev/null > > > > +++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h > > > > > > Please add a copyright notice. > > > > > > > @@ -0,0 +1,65 @@ > > > > +#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ > > > > +#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1 > > > > + > > > > +#if defined USE_MULTIARCH && IS_IN(libc) > > > > +# define UNDERSCORES __ > > > > +# ifdef USE_WITH_SSE2 > > > > +# define ISA_EXT _sse2 > > > > +# elif defined USE_WITH_AVX > > > > +# ifdef USE_WITH_RTM > > > > +# define ISA_EXT _avx_rtm > > > > +# else > > > > +# define ISA_EXT _avx > > > > +# endif > > > > +# elif defined USE_WITH_AVX2 > > > > > > Do we have a function with both AVX and AVX2 versions? If not, should > > > keep just 1. 
> > > > > > > +# ifdef USE_WITH_RTM > > > > +# define ISA_EXT _avx2_rtm > > > > +# else > > > > +# define ISA_EXT _avx2 > > > > +# endif > > > > + > > > > +# elif defined USE_WITH_EVEX256 > > > > +# define ISA_EXT _evex > > > > +# elif defined USE_WITH_EVEX512 > > > > +# define ISA_EXT _evex512 > > > > +# endif > > > > +#else > > > > +# define UNDERSCORES > > > > +# define ISA_EXT > > > > +#endif > > > > + > > > > +#ifdef USE_AS_WCSCPY > > > > +# define STRCPY_PREFIX wc > > > > +# define STRCAT_PREFIX wcs > > > > +# ifdef USE_AS_STPCPY > > > > +# define STRCPY_POSTFIX pcpy > > > > +# else > > > > +# define STRCPY_POSTFIX scpy > > > > +# endif > > > > +#else > > > > +# define STRCPY_PREFIX st > > > > +# define STRCAT_PREFIX str > > > > +# ifdef USE_AS_STPCPY > > > > +# define STRCPY_POSTFIX pcpy > > > > +# else > > > > +# define STRCPY_POSTFIX rcpy > > > > +# endif > > > > +#endif > > > > +#define STRCAT_POSTFIX cat > > > > + > > > > +#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \ > > > > + underscores##prefix##postfix##ext > > > > + > > > > +#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__) > > > > + > > > > +#ifndef OVERFLOW_STRCPY > > > > +# define OVERFLOW_STRCPY \ > > > > + OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT) > > > > +#endif > > > > + > > > > +#ifndef OVERFLOW_STRCAT > > > > +# define OVERFLOW_STRCAT \ > > > > + OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT) > > > > +#endif > > > > + > > > > +#endif > > > > -- > > > > 2.34.1 > > > > > > > > > > H.J. > > > > -- > H.J.
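The OVERFLOW_STRCPY and OVERFLOW_STRCAT names in strncpy-or-cat-overflow-def.h are built purely by token pasting: UNDERSCORES, the st/str (or wc/wcs) prefix, the rcpy/pcpy/scpy/cat postfix and ISA_EXT are concatenated through the two-level OF_NAMER/PRIMITIVE_OF_NAMER pattern, the extra level being what lets each argument expand before the ## paste. Below is a minimal standalone sketch of the narrow-character, non-STPCPY EVEX configuration; the stringification helpers and the main driver are illustrative additions, not part of the header.

#include <stdio.h>

/* Mirror of the relevant definitions from
   strncpy-or-cat-overflow-def.h for the narrow-character,
   non-STPCPY, EVEX256 configuration.  */
#define UNDERSCORES __
#define ISA_EXT _evex
#define STRCPY_PREFIX st
#define STRCPY_POSTFIX rcpy
#define STRCAT_PREFIX str
#define STRCAT_POSTFIX cat

#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
  underscores##prefix##postfix##ext
/* Extra level so the arguments are macro-expanded before pasting.  */
#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)

#define OVERFLOW_STRCPY \
  OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
#define OVERFLOW_STRCAT \
  OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)

/* Illustrative only: stringify the resolved identifiers so they can
   be printed without linking against the real symbols.  */
#define STRINGIFY_1(x) #x
#define STRINGIFY(x) STRINGIFY_1 (x)

int
main (void)
{
  puts (STRINGIFY (OVERFLOW_STRCPY));	/* prints __strcpy_evex */
  puts (STRINGIFY (OVERFLOW_STRCAT));	/* prints __strcat_evex */
  return 0;
}

With USE_AS_WCSCPY the same pasting yields __wcscpy_evex and __wcscat_evex, and with USE_AS_STPCPY it yields __stpcpy_evex, which is how the overflow paths in strncpy-evex.S and strncat-evex.S reach the plain copy routines.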
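Two of the scalar checks that gate the fast paths in the patch are compact enough that their intent is easy to miss in the assembly: the wide-character strncat/strncpy entries reject the length with a single shift of len - 1 by 56, and every variant decides whether the first full-vector load is safe with an and/cmp against PAGE_SIZE - VEC_SIZE. Below is a minimal C rendering of both tests; the helper names, the VEC_SIZE == 32 choice and the printf driver are illustrative and not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Illustrative constants matching the 256-bit EVEX configuration;
   VEC_SIZE would be 64 for an evex512 build.  */
#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* Wide-character length filter: `lea -1(len); shr $56; jnz' rejects
   both len == 0 (which wraps to UINT64_MAX) and lengths above 1 << 56,
   which the patch treats as past the maximum supported address
   space.  */
static int
reject_length (uint64_t len)
{
  return ((len - 1) >> 56) != 0;
}

/* Page-cross test: `and $(PAGE_SIZE - 1); cmp $(PAGE_SIZE - VEC_SIZE);
   ja' is true exactly when a VEC_SIZE-byte load at SRC would touch the
   next page, in which case the code takes the L(page_cross) path.  */
static int
load_would_cross_page (uintptr_t src)
{
  return (src & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

int
main (void)
{
  printf ("%d %d %d\n", reject_length (0), reject_length (1),
	  reject_length ((1ULL << 56) + 1));		/* 1 0 1 */
  printf ("%d %d\n", load_would_cross_page (0x7000 + 4064),
	  load_would_cross_page (0x7000 + 4065));	/* 0 1 */
  return 0;
}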
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S index 99ea76a372..3693491baa 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -3,6 +3,5 @@ #endif #define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY STPNCPY -#include "strcpy-evex.S" +#define STRNCPY STPNCPY +#include "strncpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S index 0e2df947e9..b4207b7889 100644 --- a/sysdeps/x86_64/multiarch/strcat-evex.S +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -1,286 +1,7 @@ -/* strcat with 256-bit EVEX instructions. - Copyright (C) 2021-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (4) - - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_evex -# endif - -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -/* zero register */ -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM0 ymm17 -# define YMM1 ymm18 - -# define USE_AS_STRCAT - -/* Number of bytes in a vector register */ -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (STRCAT) - mov %rdi, %r9 -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - xor %eax, %eax - mov %edi, %ecx - and $((VEC_SIZE * 4) - 1), %ecx - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - cmp $(VEC_SIZE * 3), %ecx - ja L(fourth_vector_boundary) - vpcmpb $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_first_vector) - mov %rdi, %rax - and $-VEC_SIZE, %rax - jmp L(align_vec_size_start) -L(fourth_vector_boundary): - mov %rdi, %rax - and $-VEC_SIZE, %rax - vpcmpb $0, (%rax), %YMMZERO, %k0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - kmovd %k0, %edx - and %r10d, %edx - jnz L(exit) - -L(align_vec_size_start): - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 4), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - 
vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - kmovd %k4, %edx - add $(VEC_SIZE * 4), %rax - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 4), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 5), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - add $VEC_SIZE, %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - add $VEC_SIZE, %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 - add $VEC_SIZE, %rax - kmovd %k1, %edx - test %edx, %edx - jnz L(exit) - - add $VEC_SIZE, %rax - - .p2align 4 -L(align_four_vec_loop): - VMOVA (%rax), %YMM0 - VMOVA (VEC_SIZE * 2)(%rax), %YMM1 - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 - vpminub %YMM0, %YMM1, %YMM0 - /* If K0 != 0, there is a null byte. 
*/ - vpcmpb $0, %YMM0, %YMMZERO, %k0 - add $(VEC_SIZE * 4), %rax - ktestd %k0, %k0 - jz L(align_four_vec_loop) - - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 - sub $(VEC_SIZE * 5), %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_null_on_first_vector): - bsf %rdx, %rdx - add %rdx, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_second_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $VEC_SIZE, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_third_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 2), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fourth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 3), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fifth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - - .p2align 4 -L(StartStrcpyPart): - lea (%r9, %rax), %rdi - mov %rsi, %rcx - mov %r9, %rax /* save result */ - -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(ExitZero) -# define USE_AS_STRNCPY -# endif - -# include "strcpy-evex.S" +#ifndef STRCAT +# define STRCAT __strcat_evex #endif + +#define USE_AS_STRCAT +#define STRCPY STRCAT +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S new file mode 100644 index 0000000000..9530d7b683 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.S @@ -0,0 +1,110 @@ +/* strlen used for begining of str{n}cat using EVEX 256/512. + Copyright (C) 2011-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +/* NOTE: This file is meant to be included by strcat-evex or + strncat-evex and does not standalone. Before including %rdi + must be saved in %rax. */ + + +/* Simple strlen implementation that ends at + L(strcat_strlen_done). 
*/ + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 + movq %rdi, %r8 + andq $(VEC_SIZE * -1), %r8 + VPCMPEQ (%r8), %VZERO, %k0 + KMOV %k0, %VRCX +#ifdef USE_AS_WCSCPY + subl %r8d, %edi + shrl $2, %edi +#endif + shrx %VRDI, %VRCX, %VRCX +#ifdef USE_AS_WCSCPY + movq %rax, %rdi +#endif + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + leaq (VEC_SIZE)(%r8), %rdi + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v1) + + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v2) + + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v3) + + andq $-(VEC_SIZE * 4), %rdi + .p2align 4,, 8 +L(loop_2x_vec): + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) + VPTESTN %VMM(1), %VMM(1), %k1 + VPTESTN %VMM(3), %VMM(3), %k3 + subq $(VEC_SIZE * -4), %rdi + KORTEST %k1, %k3 + jz L(loop_2x_vec) + + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + KMOV %k1, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v1) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v2) + + KMOV %k3, %VRCX +L(bsf_and_done_v3): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v2): + bsf %VRCX, %VRCX + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi + jmp L(strcat_strlen_done) + + .p2align 4,, 4 +L(bsf_and_done_v1): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v0): + bsf %VRCX, %VRCX +#ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +#else + addq %rcx, %rdi +#endif +L(strcat_strlen_done): diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S index 82e45ac675..1ba0195ed2 100644 --- a/sysdeps/x86_64/multiarch/strcpy-evex.S +++ b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -1,4 +1,4 @@ -/* strcpy with 256-bit EVEX instructions. +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. Copyright (C) 2021-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,990 +17,526 @@ <https://www.gnu.org/licenses/>. */ #include <isa-level.h> - #if ISA_SHOULD_BUILD (4) -# ifndef USE_AS_STRCAT -# include <sysdep.h> + /* Use evex-masked stores for small sizes. Turned off at the + moment. */ +# define USE_EVEX_MASKED_STORE 0 + /* Use movsb in page cross case to save code size. 
*/ +# define USE_MOVSB_IN_PAGE_CROSS 1 -# ifndef STRCPY -# define STRCPY __strcpy_evex -# endif +# include <sysdep.h> +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -/* Number of bytes in a vector register */ -# ifndef VEC_SIZE -# define VEC_SIZE 32 +# ifndef STRCPY +# define STRCPY __strcpy_evex # endif -# define XMM2 xmm18 -# define XMM3 xmm19 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 -# define YMM7 ymm23 +# ifdef USE_AS_WCSCPY +# define VMOVU_MASK vmovdqu32 +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 -# ifndef USE_AS_STRCAT +# define REP_MOVS rep movsd -/* zero register */ -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM1 ymm17 - - .section .text.evex,"ax",@progbits -ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP - test %R8_LP, %R8_LP - jz L(ExitZero) -# endif - mov %rsi, %rcx -# ifndef USE_AS_STPCPY - mov %rdi, %rax /* save result */ -# endif +# define USE_WIDE_CHAR +# else +# define VMOVU_MASK vmovdqu8 +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 - vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# define REP_MOVS rep movsb # endif - and $((VEC_SIZE * 4) - 1), %ecx - cmp $(VEC_SIZE * 2), %ecx - jbe L(SourceStringAlignmentLessTwoVecSize) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - - vpcmpb $0, (%rsi), %YMMZERO, %k0 - kmovd %k0, %edx - shr %cl, %rdx +# include "reg-macros.h" -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - mov $VEC_SIZE, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# else - mov $(VEC_SIZE + 1), %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# endif - jbe L(CopyVecSizeTailCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyVecSizeTail) - - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 - kmovd %k1, %edx -# ifdef USE_AS_STRNCPY - add $VEC_SIZE, %r10 - cmp %r10, %r8 - jbe L(CopyTwoVecSizeCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize) - - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ - VMOVU %YMM2, (%rdi) - -/* If source address alignment != destination address alignment */ - .p2align 4 -L(UnalignVecSizeBoth): - sub %rcx, %rdi -# ifdef USE_AS_STRNCPY - add %rcx, %r8 - sbb %rcx, %rcx - or %rcx, %r8 -# endif - mov $VEC_SIZE, %rcx - VMOVA (%rsi, %rcx), %YMM2 - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 3), %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) +# ifdef USE_AS_STPCPY +# define END_REG rax # else - jnz L(CopyVecSize) +# define END_REG rdi, %rdx, CHAR_SIZE # endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) +# ifdef USE_AS_STRCAT +# define PAGE_ALIGN_REG edx +# define PAGE_ALIGN_REG_64 rdx # else - jnz L(CopyVecSize) +# define PAGE_ALIGN_REG eax +# define PAGE_ALIGN_REG_64 rax # endif - VMOVU %YMM3, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 - vpcmpb $0, %YMM4, %YMMZERO, %k0 - kmovd %k0, %edx - 
add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) - VMOVU %YMM4, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - VMOVU %YMM2, (%rdi, %rcx) - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM3, (%rdi, %rcx) - mov %rsi, %rdx - lea VEC_SIZE(%rsi, %rcx), %rsi - and $-(VEC_SIZE * 4), %rsi - sub %rsi, %rdx - sub %rdx, %rdi -# ifdef USE_AS_STRNCPY - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -# endif -L(UnalignedFourVecSizeLoop): - VMOVA (%rsi), %YMM4 - VMOVA VEC_SIZE(%rsi), %YMM5 - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM5, %YMM4, %YMM2 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. */ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) + .section SECTION(.text), "ax", @progbits +ENTRY(STRCPY) +# ifdef USE_AS_STRCAT + movq %rdi, %rax +# include "strcat-strlen-evex.S" # endif - test %edx, %edx - jnz L(UnalignedFourVecSizeLeave) - -L(UnalignedFourVecSizeLoop_start): - add $(VEC_SIZE * 4), %rdi - add $(VEC_SIZE * 4), %rsi - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) - VMOVA (%rsi), %YMM4 - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) - VMOVA VEC_SIZE(%rsi), %YMM5 - vpminub %YMM5, %YMM4, %YMM2 - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVU %YMM7, -VEC_SIZE(%rdi) - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. */ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) + + movl %esi, %PAGE_ALIGN_REG + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG + ja L(page_cross) +L(page_cross_continue): + VMOVU (%rsi), %VMM(0) +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax # endif - test %edx, %edx - jz L(UnalignedFourVecSizeLoop_start) -L(UnalignedFourVecSizeLeave): - vpcmpb $0, %YMM4, %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_0) - vpcmpb $0, %YMM5, %YMMZERO, %k2 - kmovd %k2, %ecx - test %ecx, %ecx - jnz L(CopyVecSizeUnaligned_16) + /* Two short string implementations. 
One with traditional + branching approach and one with masked instructions (which + have potential for dramatically bad perf if dst splits a + page and is not in the TLB). */ +# if USE_EVEX_MASKED_STORE + VPTEST %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WCSCPY + subl $((1 << CHAR_PER_VEC)- 1), %VRCX +# else + inc %VRCX +# endif + jz L(more_1x_vec) + KMOV %VRCX, %k1 + KXOR %k0, %k1, %k1 - vpcmpb $0, %YMM6, %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_32) - - vpcmpb $0, %YMM7, %YMMZERO, %k4 - kmovd %k4, %ecx - bsf %ecx, %edx - VMOVU %YMM4, (%rdi) - VMOVU %YMM5, VEC_SIZE(%rdi) - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax -# endif - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $(VEC_SIZE * 3), %rsi - add $(VEC_SIZE * 3), %rdi - jmp L(CopyVecSizeExit) -# endif + VMOVU_MASK %VMM(0), (%rdi){%k1} -/* If source address alignment == destination address alignment */ +# ifdef USE_AS_STPCPY + bsf %VRCX, %VRCX + leaq (%rdi, %rcx, CHAR_SIZE), %rax +# endif + ret -L(SourceStringAlignmentLessTwoVecSize): - VMOVU (%rsi), %YMM3 - VMOVU VEC_SIZE(%rsi), %YMM2 - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx +# else + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jz L(more_1x_vec) -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $VEC_SIZE, %r8 + xorl %edx, %edx + bsf %VRCX, %VRDX +# ifdef USE_AS_STPCPY + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# endif + + /* Use mask bits in rcx to detect which copy we need. If the low + mask is zero then there must be a bit set in the upper half. + I.e if rcx != 0 and ecx == 0, then match must be upper 32 + bits so we use L(copy_32_63). */ +# if VEC_SIZE == 64 +# ifdef USE_AS_WCSCPY + testb %cl, %cl +# else + testl %ecx, %ecx +# endif + jz L(copy_32_63) +# endif + +# ifdef USE_AS_WCSCPY + testb $0xf, %cl # else - cmp $(VEC_SIZE + 1), %r8 + testw %cx, %cx # endif - jbe L(CopyVecSizeTail1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyVecSizeTail1) + jz L(copy_16_31) - VMOVU %YMM3, (%rdi) - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $(VEC_SIZE * 2), %r8 +# ifdef USE_AS_WCSCPY + testb $0x3, %cl # else - cmp $((VEC_SIZE * 2) + 1), %r8 + testb %cl, %cl # endif - jbe L(CopyTwoVecSize1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize1) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - jmp L(UnalignVecSizeBoth) + jz L(copy_8_15) -/*------End of main part with loops---------------------*/ -/* Case1 */ +# ifdef USE_AS_WCSCPY + vmovd %VMM_128(0), (%rdi) + /* No need to copy, we know its zero. 
*/ + movl $0, (%END_REG) -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) - .p2align 4 -L(CopyVecSize): - add %rcx, %rdi -# endif -L(CopyVecSizeTail): - add %rcx, %rsi -L(CopyVecSizeTail1): - bsf %edx, %edx -L(CopyVecSizeExit): - cmp $32, %edx - jae L(Exit32_63) - cmp $16, %edx - jae L(Exit16_31) - cmp $8, %edx - jae L(Exit8_15) - cmp $4, %edx - jae L(Exit4_7) - cmp $3, %edx - je L(Exit3) - cmp $1, %edx - ja L(Exit2) - je L(Exit1) - movb $0, (%rdi) -# ifdef USE_AS_STPCPY - lea (%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $1, %r8 - lea 1(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif ret +# else - .p2align 4 -L(CopyTwoVecSize1): - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $VEC_SIZE, %r8 -# endif - jmp L(CopyVecSizeTail1) - - .p2align 4 -L(CopyTwoVecSize): - bsf %edx, %edx - add %rcx, %rsi - add $VEC_SIZE, %edx - sub %ecx, %edx - jmp L(CopyVecSizeExit) - - .p2align 4 -L(CopyVecSizeUnaligned_0): - bsf %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - VMOVU %YMM4, (%rdi) - add $((VEC_SIZE * 4) - 1), %r8 - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - jmp L(CopyVecSizeExit) -# endif + testb $0x7, %cl + jz L(copy_4_7) - .p2align 4 -L(CopyVecSizeUnaligned_16): - bsf %ecx, %edx - VMOVU %YMM4, (%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea VEC_SIZE(%rdi, %rdx), %rax -# endif - VMOVU %YMM5, VEC_SIZE(%rdi) - add $((VEC_SIZE * 3) - 1), %r8 - sub %rdx, %r8 - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi - jmp L(CopyVecSizeExit) -# endif - .p2align 4 -L(CopyVecSizeUnaligned_32): - bsf %edx, %edx - VMOVU %YMM4, (%rdi) - VMOVU %YMM5, VEC_SIZE(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax -# endif - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - add $((VEC_SIZE * 2) - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $(VEC_SIZE * 2), %rsi - add $(VEC_SIZE * 2), %rdi - jmp L(CopyVecSizeExit) -# endif + test %edx, %edx + jz L(set_null_term) -# ifdef USE_AS_STRNCPY -# ifndef USE_AS_STRCAT - .p2align 4 -L(CopyVecSizeUnalignedVec6): - VMOVU %YMM6, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec5): - VMOVU %YMM5, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec4): - VMOVU %YMM4, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec3): - VMOVU %YMM3, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) + /* NB: make this `vmovw` if support for AVX512-FP16 is added. + */ + vmovd %VMM_128(0), %esi + movw %si, (%rdi) + + .p2align 4,, 1 +L(set_null_term): + /* No need to copy, we know its zero. 
*/ + movb $0, (%END_REG) + ret # endif -/* Case2 */ - - .p2align 4 -L(CopyVecSizeCase2): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2): - add %rcx, %rsi - bsf %edx, %edx - add $VEC_SIZE, %edx - sub %ecx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTailCase2): - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTail1Case2): - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -/* Case2 or Case3, Case3 */ - - .p2align 4 -L(CopyVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeCase2) -L(CopyVecSizeCase3): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyTwoVecSizeCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyVecSizeTailCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTailCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSize1Case2OrCase3): - add $VEC_SIZE, %rdi - add $VEC_SIZE, %rsi - sub $VEC_SIZE, %r8 -L(CopyVecSizeTail1Case2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTail1Case2) - jmp L(StrncpyExit) +# if VEC_SIZE == 64 + .p2align 4,, 6 +L(copy_32_63): + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) + ret +# endif + + + .p2align 4,, 6 +L(copy_16_31): + /* Use xmm1 explicitly here as it won't require a `vzeroupper` + and will save code size. */ + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 + VMOVU %VMM_128(0), (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) + ret + + .p2align 4,, 8 +L(copy_8_15): +# ifdef USE_AS_WCSCPY + movl -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx +# else + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rcx +# endif + vmovq %VMM_128(0), (%rdi) + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) + ret # endif -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ - .p2align 4 -L(Exit1): - movzwl (%rsi), %edx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax +# ifndef USE_AS_WCSCPY + .p2align 4,, 12 +L(copy_4_7): + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %ecx + vmovd %VMM_128(0), (%rdi) + movl %ecx, -(4 - CHAR_SIZE)(%END_REG) + ret # endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $2, %r8 - lea 2(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + + + .p2align 4,, 8 +L(more_1x_vec): +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + VMOVU %VMM(0), (%rdi) # endif - ret + subq %rsi, %rdi + andq $-(VEC_SIZE), %rsi + addq %rsi, %rdi + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) - .p2align 4 -L(Exit2): - movzwl (%rsi), %ecx - mov %cx, (%rdi) - movb $0, 2(%rdi) + /* Ideally we store after moves to minimize impact of potential + false-dependencies. 
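+   Here that means the unaligned store of the first VEC to (%rax) is issued after the aligned load of the next VEC rather than before it.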
*/ +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + VMOVU %VMM(0), (%rax) +# endif + + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) + VMOVU %VMM(1), VEC_SIZE(%rdi) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) + + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x4) + + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) + + + /* Align for 4x loop. */ + subq %rsi, %rdi + + /* + VEC_SIZE * 5 because we never added the original VEC_SIZE + we covered before aligning. */ + subq $-(VEC_SIZE * 5), %rsi + andq $-(VEC_SIZE * 4), %rsi + + + /* Load first half of the loop before entry. */ + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jnz L(loop_4x_done) + + .p2align 4,, 11 +L(loop_4x_vec): + + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi, %rsi) + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi, %rsi) + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi, %rsi) + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi, %rsi) + + subq $(VEC_SIZE * -4), %rsi + + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jz L(loop_4x_vec) + +L(loop_4x_done): + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + /* Restore rdi (%rdi). */ + addq %rsi, %rdi + test %VRCX, %VRCX + jnz L(ret_vec_x0_end) + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) + + KMOV %k2, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x1) + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) + /* Place L(ret_vec_x4) here to save code size. We get a + meaningfuly benefit doing this for stpcpy. 
*/ + KMOV %k4, %VRDX +L(ret_vec_x3): + bsf %VRDX, %VRDX + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 3 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $3, %r8 - lea 3(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 3 + 0)(%rdi, %rdx, CHAR_SIZE), %rax # endif +L(return_end): ret - .p2align 4 -L(Exit3): - mov (%rsi), %edx - mov %edx, (%rdi) + .p2align 4,, 6 +L(ret_vec_x0_end): + bsf %VRCX, %VRCX # ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $4, %r8 - lea 4(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (%rdi, %rcx, CHAR_SIZE), %rax # endif + inc %VRCX + VMOVU (-(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), (-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) ret - .p2align 4 -L(Exit4_7): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov -3(%rsi, %rdx), %ecx - mov %ecx, -3(%rdi, %rdx) + .p2align 4,, 8 +L(ret_vec_x1): + bsf %VRCX, %VRCX + VMOVU (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), (VEC_SIZE -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit8_15): - mov (%rsi), %rcx - mov -7(%rsi, %rdx), %r9 - mov %rcx, (%rdi) - mov %r9, -7(%rdi, %rdx) + .p2align 4,, 4 +L(ret_vec_x2): + bsf %VRCX, %VRCX + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit16_31): - VMOVU (%rsi), %XMM2 - VMOVU -15(%rsi, %rdx), %XMM3 - VMOVU %XMM2, (%rdi) - VMOVU %XMM3, -15(%rdi, %rdx) + /* ret_vec_x3 reuses return code after the loop. 
*/ + .p2align 4,, 6 +L(ret_vec_x4): + bsf %VRCX, %VRCX + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax # endif ret - .p2align 4 -L(Exit32_63): - VMOVU (%rsi), %YMM2 - VMOVU -31(%rsi, %rdx), %YMM3 - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, -31(%rdi, %rdx) -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax + + .p2align 4,, 4 +L(page_cross): +# ifndef USE_AS_STRCAT + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 # endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + movq %rsi, %rcx + andq $(VEC_SIZE * -1), %rcx + + VPCMPEQ (%rcx), %VZERO, %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WCSCPY + andl $(VEC_SIZE - 1), %PAGE_ALIGN_REG + shrl $2, %PAGE_ALIGN_REG # endif - ret + shrx %VGPR(PAGE_ALIGN_REG_64), %VRCX, %VRCX -# ifdef USE_AS_STRNCPY +# if USE_MOVSB_IN_PAGE_CROSS + /* Optimizing more aggressively for space as this is very cold + code. This saves 2x cache lines. */ - .p2align 4 -L(StrncpyExit1): - movzbl (%rsi), %edx - mov %dl, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 1(%rdi) + /* This adds once to the later result which will get correct + copy bounds. NB: this can never zero-out a non-zero RCX as + to be in the page cross case rsi cannot be aligned and we + already right-shift rcx by the misalignment. */ + shl %VRCX + jz L(page_cross_continue) +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax # endif - ret + bsf %VRCX, %VRCX + REP_MOVS - .p2align 4 -L(StrncpyExit2): - movzwl (%rsi), %edx - mov %dx, (%rdi) # ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 2(%rdi) + leaq -CHAR_SIZE(%rdi), %rax # endif ret - .p2align 4 -L(StrncpyExit3_4): - movzwl (%rsi), %ecx - movzwl -2(%rsi, %r8), %edx - mov %cx, (%rdi) - mov %dx, -2(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret - .p2align 4 -L(StrncpyExit5_8): - mov (%rsi), %ecx - mov -4(%rsi, %r8), %edx - mov %ecx, (%rdi) - mov %edx, -4(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret +# else + /* Check if we found zero-char before end of page. */ + test %VRCX, %VRCX + jz L(page_cross_continue) - .p2align 4 -L(StrncpyExit9_16): - mov (%rsi), %rcx - mov -8(%rsi, %r8), %rdx - mov %rcx, (%rdi) - mov %rdx, -8(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret + /* Traditional copy case, essentially same as used in non-page- + cross case but since we can't reuse VMM(0) we need twice as + many loads from rsi. 
*/ - .p2align 4 -L(StrncpyExit17_32): - VMOVU (%rsi), %XMM2 - VMOVU -16(%rsi, %r8), %XMM3 - VMOVU %XMM2, (%rdi) - VMOVU %XMM3, -16(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) +# ifndef USE_AS_STRCAT + xorl %edx, %edx # endif - ret - - .p2align 4 -L(StrncpyExit33_64): - /* 0/32, 31/16 */ - VMOVU (%rsi), %YMM2 - VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) + /* Dependency on rdi must already have been satisfied. */ + bsf %VRCX, %VRDX # ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# elif !defined USE_AS_STRCAT + movq %rdi, %rax # endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - ret - .p2align 4 -L(StrncpyExit65): - /* 0/32, 32/32, 64/1 */ - VMOVU (%rsi), %YMM2 - VMOVU 32(%rsi), %YMM3 - mov 64(%rsi), %cl - VMOVU %YMM2, (%rdi) - VMOVU %YMM3, 32(%rdi) - mov %cl, 64(%rdi) -# ifdef USE_AS_STPCPY - lea 65(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 65(%rdi) +# if VEC_SIZE == 64 +# ifdef USE_AS_WCSCPY + testb %cl, %cl +# else + test %ecx, %ecx +# endif + jz L(page_cross_copy_32_63) # endif - ret - -# ifndef USE_AS_STRCAT - .p2align 4 -L(Fill1): - mov %dl, (%rdi) - ret +# ifdef USE_AS_WCSCPY + testb $0xf, %cl +# else + testw %cx, %cx +# endif + jz L(page_cross_copy_16_31) - .p2align 4 -L(Fill2): - mov %dx, (%rdi) - ret +# ifdef USE_AS_WCSCPY + testb $0x3, %cl +# else + testb %cl, %cl +# endif + jz L(page_cross_copy_8_15) - .p2align 4 -L(Fill3_4): - mov %dx, (%rdi) - mov %dx, -2(%rdi, %r8) +# ifdef USE_AS_WCSCPY + movl (%rsi), %esi + movl %esi, (%rdi) + movl $0, (%END_REG) ret +# else - .p2align 4 -L(Fill5_8): - mov %edx, (%rdi) - mov %edx, -4(%rdi, %r8) - ret + testb $0x7, %cl + jz L(page_cross_copy_4_7) - .p2align 4 -L(Fill9_16): - mov %rdx, (%rdi) - mov %rdx, -8(%rdi, %r8) + test %edx, %edx + jz L(page_cross_set_null_term) + movzwl (%rsi), %ecx + movw %cx, (%rdi) +L(page_cross_set_null_term): + movb $0, (%END_REG) ret - .p2align 4 -L(Fill17_32): - VMOVU %XMMZERO, (%rdi) - VMOVU %XMMZERO, -16(%rdi, %r8) - ret - .p2align 4 -L(CopyVecSizeUnalignedVec2): - VMOVU %YMM2, (%rdi, %rcx) - - .p2align 4 -L(CopyVecSizeVecExit): - bsf %edx, %edx - add $(VEC_SIZE - 1), %r8 - add %rcx, %rdi -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - - .p2align 4 -L(StrncpyFillTailWithZero): - xor %edx, %edx - sub $VEC_SIZE, %r8 - jbe L(StrncpyFillExit) - - VMOVU %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - - mov %rdi, %rsi - and $(VEC_SIZE - 1), %esi - sub %rsi, %rdi - add %rsi, %r8 - sub $(VEC_SIZE * 4), %r8 - jb L(StrncpyFillLessFourVecSize) - -L(StrncpyFillLoopVmovdqa): - VMOVA %YMMZERO, (%rdi) - VMOVA %YMMZERO, VEC_SIZE(%rdi) - VMOVA %YMMZERO, (VEC_SIZE * 2)(%rdi) - VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE * 4), %rdi - sub $(VEC_SIZE * 4), %r8 - jae L(StrncpyFillLoopVmovdqa) - -L(StrncpyFillLessFourVecSize): - add $(VEC_SIZE * 2), %r8 - jl L(StrncpyFillLessTwoVecSize) - VMOVA %YMMZERO, (%rdi) - VMOVA %YMMZERO, VEC_SIZE(%rdi) - add $(VEC_SIZE * 2), %rdi - sub $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - VMOVA %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillLessTwoVecSize): - add $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - VMOVA %YMMZERO, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillExit): - add $VEC_SIZE, %r8 -L(Fill): - cmp $17, %r8d - jae L(Fill17_32) - cmp $9, %r8d - jae L(Fill9_16) - cmp $5, %r8d - jae L(Fill5_8) - cmp $3, %r8d - 
jae L(Fill3_4) - cmp $1, %r8d - ja L(Fill2) - je L(Fill1) + .p2align 4,, 4 +L(page_cross_copy_4_7): + movl (%rsi), %ecx + movl -(4 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %esi + movl %ecx, (%rdi) + movl %esi, -(4 - CHAR_SIZE)(%END_REG) ret - -/* end of ifndef USE_AS_STRCAT */ # endif - .p2align 4 -L(UnalignedLeaveCase2OrCase3): - test %rdx, %rdx - jnz L(UnalignedFourVecSizeLeaveCase2) -L(UnalignedFourVecSizeLeaveCase3): - lea (VEC_SIZE * 4)(%r8), %rcx - and $-VEC_SIZE, %rcx - add $(VEC_SIZE * 3), %r8 - jl L(CopyVecSizeCase3) - VMOVU %YMM4, (%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM5, VEC_SIZE(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 4)(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (VEC_SIZE * 4)(%rdi) -# endif +# if VEC_SIZE == 64 + .p2align 4,, 4 +L(page_cross_copy_32_63): + VMOVU (%rsi), %VMM_256(0) + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%END_REG) ret - - .p2align 4 -L(UnalignedFourVecSizeLeaveCase2): - xor %ecx, %ecx - vpcmpb $0, %YMM4, %YMMZERO, %k1 - kmovd %k1, %edx - add $(VEC_SIZE * 3), %r8 - jle L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif - vpcmpb $0, %YMM5, %YMMZERO, %k2 - kmovd %k2, %edx - VMOVU %YMM4, (%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec5) -# else - jnz L(CopyVecSize) # endif - vpcmpb $0, %YMM6, %YMMZERO, %k3 - kmovd %k3, %edx - VMOVU %YMM5, VEC_SIZE(%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec6) -# else - jnz L(CopyVecSize) -# endif - - vpcmpb $0, %YMM7, %YMMZERO, %k4 - kmovd %k4, %edx - VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) - lea VEC_SIZE(%rdi, %rcx), %rdi - lea VEC_SIZE(%rsi, %rcx), %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) -L(StrncpyExit): - cmp $65, %r8d - je L(StrncpyExit65) - cmp $33, %r8d - jae L(StrncpyExit33_64) - cmp $17, %r8d - jae L(StrncpyExit17_32) - cmp $9, %r8d - jae L(StrncpyExit9_16) - cmp $5, %r8d - jae L(StrncpyExit5_8) - cmp $3, %r8d - jae L(StrncpyExit3_4) - cmp $1, %r8d - ja L(StrncpyExit2) - je L(StrncpyExit1) -# ifdef USE_AS_STPCPY - mov %rdi, %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi) -# endif + .p2align 4,, 4 +L(page_cross_copy_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%END_REG) ret - .p2align 4 -L(ExitZero): -# ifndef USE_AS_STRCAT - mov %rdi, %rax -# endif + .p2align 4,, 4 +L(page_cross_copy_8_15): + movq (%rsi), %rcx + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi + movq %rcx, (%rdi) + movq %rsi, -(8 - CHAR_SIZE)(%END_REG) ret - -# endif - -# ifndef USE_AS_STRCAT -END (STRCPY) -# else -END (STRCAT) # endif +END(STRCPY) #endif diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S index 203a19bf21..d648ba5cfe 100644 --- a/sysdeps/x86_64/multiarch/strncat-evex.S +++ b/sysdeps/x86_64/multiarch/strncat-evex.S @@ -1,7 +1,520 @@ -#ifndef STRNCAT -# define STRNCAT __strncat_evex -#endif +/* {wcs|str}ncat with 256/512-bit EVEX. 
+ Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + + /* Use evex-masked stores for small sizes. Turned off at the + moment. */ +# define USE_EVEX_MASKED_STORE 0 + +# include <sysdep.h> + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + +# ifndef STRNCAT +# define STRNCAT __strncat_evex +# endif + + +# ifdef USE_AS_WCSCPY +# define movNULL movl +# define VMOVU_MASK vmovdqu32 +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 + +# define REP_MOVS rep movsd + +# define VMASK_REG VR10 +# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst + +# define USE_WIDE_CHAR +# else +# define movNULL movb +# define VMOVU_MASK vmovdqu8 +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 + +# define REP_MOVS rep movsb + +# define VMASK_REG VRCX +# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst + +# endif + +# include "strncpy-or-cat-overflow-def.h" + +# include "reg-macros.h" + + +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text), "ax", @progbits +ENTRY(STRNCAT) + movq %rdi, %rax + + /* NB: It's safe to filter out zero-length strings WITHOUT + setting null-term. Destination MUST be a null-terminated + string so essentially the work is already done. */ +# ifdef USE_AS_WCSCPY + leaq -1(%rdx), %rcx + shrq $56, %rcx + jnz L(zero_len) +# else + test %rdx, %rdx + jle L(zero_len) +# endif + +# include "strcat-strlen-evex.S" + + movl %esi, %ecx + andl $(PAGE_SIZE - 1), %ecx + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx + ja L(page_cross) +L(page_cross_continue): + VMOVU (%rsi), %VMM(0) + VPTESTN %VMM(0), %VMM(0), %k0 + + /* If USE_EVEX_MASK_STORE is enabled then we just handle length + <= CHAR_PER_VEC with masked instructions (which have + potential for dramatically bad perf if dst splits a page and + is not in the TLB). */ +# if USE_EVEX_MASKED_STORE + KMOV %k0, %VRCX + FIND_FIRST_ONE (VRCX, VR8) + cmpq %r8, %rdx + jbe L(less_1x_vec) + + test %VRCX, %VRCX + jz L(more_1x_vec) + + blsmsk %VRCX, %VRCX + KMOV %VRCX, %k1 + VMOVU_MASK %VMM(0), (%rdi){%k1} + ret + +L(less_1x_vec): + mov $-1, %VRCX + bzhi %VRDX, %VRCX, %VRCX + KMOV %VRCX, %k1 + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + VMOVU_MASK %VMM(0), (%rdi){%k1} + + ret +# else + KMOV %k0, %VMASK_REG + /* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf + %VMASK_REG, %VRCX` for wcsncat. */ + FIND_FIRST_ONE (VMASK_REG, VRCX) + cmpq %rcx, %rdx + jbe L(less_1x_vec) + + /* If there were no zero-CHARs (rcx was zero before + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
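+   (tzcnt of an all-zero mask returns the operand width, which equals CHAR_PER_VEC, and the wcsncat variant pre-loads CHAR_PER_VEC and relies on bsf leaving its destination unchanged when the source is zero.)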
*/ + cmpl $CHAR_PER_VEC, %ecx + je L(more_1x_vec) + + movl %ecx, %edx + +L(less_1x_vec): +# if VEC_SIZE == 64 + cmpl $(32 / CHAR_SIZE), %edx + jae L(copy_32_63) +# endif + + cmpl $(16 / CHAR_SIZE), %edx + jae L(copy_16_31) + + + cmpl $(8 / CHAR_SIZE), %edx + jae L(copy_8_15) + +# ifdef USE_AS_WCSCPY + vmovd %VMM_128(0), (%rdi) + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + ret +# else + + cmpl $4, %edx + jae L(copy_4_7) + + movzbl (%rsi), %ecx + cmpl $1, %edx + jbe L(set_null_term) + + movzwl 1(%rsi), %esi + movw %si, 1(%rdi) + + .p2align 4,, 1 +L(set_null_term): + movb %cl, (%rdi) + movNULL $0, (%rdi, %rdx) + ret +# endif + +# if VEC_SIZE == 64 + .p2align 4,, 6 +L(copy_32_63): + VMOVU -(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE) + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + ret +# endif + .p2align 4,, 6 +L(copy_16_31): + /* Use xmm1 explicitly here as it won't require a `vzeroupper` + and will save code size. */ + vmovdqu -(16)(%rsi, %rdx, CHAR_SIZE), %xmm1 + VMOVU %VMM_128(0), (%rdi) + vmovdqu %xmm1, -(16)(%rdi, %rdx, CHAR_SIZE) + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + ret + + .p2align 4,, 2 +L(copy_8_15): + movq -(8)(%rsi, %rdx, CHAR_SIZE), %rcx + vmovq %VMM_128(0), (%rdi) + movq %rcx, -(8)(%rdi, %rdx, CHAR_SIZE) + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + ret + +# ifndef USE_AS_WCSCPY + .p2align 4,, 12 +L(copy_4_7): + movl -(4)(%rsi, %rdx, CHAR_SIZE), %ecx + vmovd %VMM_128(0), (%rdi) + movl %ecx, -(4)(%rdi, %rdx, CHAR_SIZE) + movNULL $0, (%rdi, %rdx, CHAR_SIZE) + ret +# endif + +# endif + .p2align 4,, 4 +L(zero_len): +# ifdef USE_AS_WCSCPY + test %rdx, %rdx +# endif + jne OVERFLOW_STRCAT + ret -#define USE_AS_STRNCAT -#define STRCAT STRNCAT -#include "strcat-evex.S" + .p2align 4,, 8 +L(more_1x_vec): + VMOVU %VMM(0), (%rdi) + + /* We are going to align rsi here so will need to be able to re- + adjust rdi/rdx afterwords. NB: We filtered out huge lengths + so rsi + rdx * CHAR_SIZE cannot overflow. */ + + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx + subq %rsi, %rdi + andq $-(VEC_SIZE), %rsi +L(loop_last_4x_vec): + addq %rsi, %rdi + subq %rsi, %rdx +# ifdef USE_AS_WCSCPY + shrq $2, %rdx +# endif + + /* Will need this regardless. */ + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VMASK_REG + + cmpq $(CHAR_PER_VEC * 2), %rdx + ja L(more_2x_vec) + +L(last_2x_vec): + FIND_FIRST_ONE (VMASK_REG, VRCX) + cmpl %ecx, %edx + jbe L(ret_vec_x1_len) + + /* If there were no zero-CHARs (rcx was zero before + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. 
*/ + cmpl $CHAR_PER_VEC, %ecx + jne L(ret_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + addl $-CHAR_PER_VEC, %edx + bzhi %VRDX, %VRCX, %VR8 + jz L(ret_vec_x2_len) +L(ret_vec_x2): + bsf %VRCX, %VRDX +L(ret_vec_x2_len): + VMOVU (VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + movNULL $0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + VMOVU %VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) + ret + + .p2align 4,, 4 +L(ret_vec_x1_len): + movl %edx, %ecx +L(ret_vec_x1): + VMOVU (VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + movNULL $0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE) + VMOVU %VMM(0), (VEC_SIZE-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) + VZEROUPPER_RETURN + + + .p2align 4,, 8 +L(last_4x_vec): + addl $-(CHAR_PER_VEC * 4), %edx + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VMASK_REG + subq $-(VEC_SIZE * 4), %rsi + subq $-(VEC_SIZE * 4), %rdi + cmpl $(CHAR_PER_VEC * 2), %edx + jbe L(last_2x_vec) + .p2align 4,, 8 +L(more_2x_vec): +# ifdef USE_AS_WCSCPY + xorl %ecx, %ecx +# endif + bsf %VMASK_REG, %VRCX + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VMASK_REG + + cmpq $(CHAR_PER_VEC * 4), %rdx + ja L(more_4x_vec) + + /* Adjust length before going to L(ret_vec_x3_len) or + L(ret_vec_x3). */ + addl $(CHAR_PER_VEC * -2), %edx + + FIND_FIRST_ONE (VMASK_REG, VRCX) + cmpl %ecx, %edx + jbe L(ret_vec_x3_len) + + /* If there were no zero-CHARs (rcx was zero before + FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC. */ + cmpl $CHAR_PER_VEC, %ecx + jne L(ret_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + addl $-CHAR_PER_VEC, %edx + bzhi %VRDX, %VRCX, %VR8 + jz L(ret_vec_x4_len) +L(ret_vec_x4): + bsf %VRCX, %VRDX +L(ret_vec_x4_len): + VMOVU (VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + movNULL $0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE) + VMOVU %VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE) + ret + + .p2align 4,, 4 +L(ret_vec_x3_len): + movl %edx, %ecx +L(ret_vec_x3): + VMOVU (VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + movNULL $0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) + VMOVU %VMM(0), (VEC_SIZE * 3-(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE) + ret + + .p2align 4,, 8 +L(more_4x_vec): +# ifdef USE_AS_WCSCPY + xorl %ecx, %ecx +# endif + bsf %VMASK_REG, %VRCX + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x4) + + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) + + /* Check if we are near the end before aligning. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + jbe L(last_4x_vec) + + + /* Add rsi to rdx (length) before aligning rsi. NB: Since we + filtered out huge lengths this cannot overflow. */ +# ifdef USE_AS_WCSCPY + leaq (%rsi, %rdx, CHAR_SIZE), %rdx +# else + addq %rsi, %rdx +# endif + + /* Subtract rsi from rdi before aligning (add back will have + correct rdi for aligned rsi). */ + subq %rsi, %rdi + subq $-(VEC_SIZE * 5), %rsi + andq $(VEC_SIZE * -4), %rsi + + /* Load first half of the loop before entry. 
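+   The loop below copies four VECs per iteration, checks the end of the buffer against %r9, and detects a null CHAR in any of the four freshly loaded VECs with two VPMIN reductions, VPTESTN and a single KORTEST.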
*/ + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + + /* Offset rsi by VEC_SIZE so that we can jump to + L(loop_last_4x_vec). */ + addq $-(VEC_SIZE), %rsi + KORTEST %k2, %k4 + jnz L(loop_4x_done) + + /* Store loop end in r9. */ + leaq -(VEC_SIZE * 5)(%rdx), %r9 + + .p2align 4,, 11 +L(loop_4x_vec): + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) + + subq $(VEC_SIZE * -4), %rsi + cmpq %rsi, %r9 + jbe L(loop_last_4x_vec) + + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jz L(loop_4x_vec) + +L(loop_4x_done): + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + /* Restore rdi (dst). */ + addq %rsi, %rdi + + /* L(ret_vec_x1) expects rcx to have position of zero-CHAR so + test with bsf. */ + bsf %VRCX, %VRCX + jnz L(ret_vec_x1) + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi) + + KMOV %k2, %VRCX + test %VRCX, %VRCX + jnz L(ret_vec_x2) + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + bsf %VRCX, %VRCX + jnz L(ret_vec_x3) + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) + + KMOV %k4, %VRCX + bsf %VRCX, %VRCX + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE) + ret + + + .p2align 4,, 4 +L(page_cross): + movq %rsi, %r8 + andq $(VEC_SIZE * -1), %r8 + VPCMPEQ (%r8), %VZERO, %k0 + +# ifdef USE_AS_WCSCPY + KMOV %k0, %VR9 + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx + shrx %VRCX, %VR9, %VRCX +# else + KMOV %k0, %VRCX + shrx %VRSI, %VRCX, %VRCX +# endif + + subl %esi, %r8d + andl $(VEC_SIZE - 1), %r8d +# ifdef USE_AS_WCSCPY + shrl $2, %r8d +# endif + cmpq %r8, %rdx + jbe L(page_cross_small) + /* Optimizing more for space as this is very cold code. This + saves 2x cache lines. */ + + /* This adds once to the later result which will get correct + copy bounds. NB: this can never zero-out a non-zero RCX as + to be in the page cross case rsi cannot be aligned and we + already right-shift rcx by the misalignment. */ + shl %VRCX + jz L(page_cross_continue) + bsf %VRCX, %VRCX + REP_MOVS + ret + +L(page_cross_small): + tzcnt %VRCX, %VRCX + jz L(page_cross_setz) + cmpl %edx, %ecx + cmova %edx, %ecx + +# ifdef USE_AS_WCSCPY + rep movsd +# else + rep movsb +# endif +L(page_cross_setz): + movNULL $0, (%rdi) + ret +END(STRNCAT) +#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S index 1b3426d511..49eaf4cbd9 100644 --- a/sysdeps/x86_64/multiarch/strncpy-evex.S +++ b/sysdeps/x86_64/multiarch/strncpy-evex.S @@ -1,7 +1,990 @@ -#ifndef STRNCPY -# define STRNCPY __strncpy_evex -#endif +/* {wcs|wcp|str|stp}ncpy with 256/512-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (4) + + /* Use evex-masked stores for small sizes. Turned off at the + moment. */ +# define USE_EVEX_MASKED_STORE 0 + + +# include <sysdep.h> +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + + +# ifndef STRNCPY +# define STRNCPY __strncpy_evex +# endif + +# ifdef USE_AS_WCSCPY +# define VMOVU_MASK vmovdqu32 +# define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define CHAR_SIZE 4 + +# define REP_MOVS rep movsd +# define REP_STOS rep stosl + +# define USE_WIDE_CHAR + +# else +# define VMOVU_MASK vmovdqu8 +# define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define CHAR_SIZE 1 + +# define REP_MOVS rep movsb +# define REP_STOS rep stosb +# endif + +# include "strncpy-or-cat-overflow-def.h" + +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# include "reg-macros.h" + + +# define VZERO VMM(7) +# define VZERO_256 VMM_256(7) +# define VZERO_128 VMM_128(7) + +# if VEC_SIZE == 64 +# define VZERO_HALF VZERO_256 +# else +# define VZERO_HALF VZERO_128 +# endif + + .section SECTION(.text), "ax", @progbits +ENTRY(STRNCPY) + /* Filter zero length strings and very long strings. Zero + length strings just return, very long strings are handled by + just running rep stos{b|l} to zero set (which will almost + certainly segfault), if that succeeds then just calling + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ +# ifdef USE_AS_WCSCPY + decq %rdx + movq %rdx, %rax + /* 56 is end of max supported address space. */ + shr $56, %rax + jnz L(zero_len) +# else + decq %rdx + /* If the flag needs to become `jb` replace `dec` with `sub`. + */ + jl L(zero_len) +# endif + + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 + movl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + +L(page_cross_continue): + VMOVU (%rsi), %VMM(0) + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + + /* If no STPCPY just save end ahead of time. */ +# ifndef USE_AS_STPCPY + movq %rdi, %rax +# endif + + + cmpq $(CHAR_PER_VEC), %rdx + + /* If USE_EVEX_MASK_STORE is enabled then we just handle length + <= CHAR_PER_VEC with masked instructions (which have + potential for dramatically bad perf if dst splits a page and + is not in the TLB). */ +# if USE_EVEX_MASKED_STORE + /* `jae` because length rdx is now length - 1. */ + jae L(more_1x_vec) + + /* If there where multiple zero-CHAR matches in the first VEC, + VRCX will be overset but thats fine since any oversets where + at zero-positions anyways. 
*/ + +# ifdef USE_AS_STPCPY + tzcnt %VRCX, %VRAX + cmpl %eax, %edx + cmovb %edx, %eax +# ifdef USE_AS_WCSCPY + adcl $0, %eax + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + adcq %rdi, %rax +# endif +# endif + dec %VRCX + + /* Zero out all non-zero CHAR's after the first zero match. */ + KMOV %VRCX, %k1 + + /* Use VZERO as destination so this can be reused for + L(zfill_less_vec) (which if jumped to by subsequent logic + will have zerod out VZERO. */ + VMOVU_MASK %VMM(0), %VZERO{%k1}{z} +L(zfill_less_vec): + /* Get mask for what we need to set. */ + incl %edx + mov $-1, %VRCX + bzhi %VRDX, %VRCX, %VRCX + KMOV %VRCX, %k1 + VMOVU_MASK %VZERO, (%rdi){%k1} + ret + + .p2align 4,, 4 +L(zero_len): + cmpq $-1, %rdx + jne L(best_effort_strncpy) + movq %rdi, %rax + ret + + .p2align 4,, 8 +L(more_1x_vec): +# else + /* `jb` because length rdx is now length - 1. */ + jb L(less_1x_vec) +# endif + + + /* This may overset but thats fine because we still need to zero + fill. */ + VMOVU %VMM(0), (%rdi) + + + /* Length must be >= CHAR_PER_VEC so match here means we must + zero-fill. */ + test %VRCX, %VRCX + jnz L(zfill) + + + /* We are going to align rsi here so will need to be able to re- + adjust rdi/rdx afterwords. NB: We filtered out huge lengths + so rsi + rdx * CHAR_SIZE cannot overflow. */ + leaq (VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx + subq %rsi, %rdi + andq $-(VEC_SIZE), %rsi + +L(loop_last_4x_vec): + addq %rsi, %rdi + subq %rsi, %rdx +# ifdef USE_AS_WCSCPY + shrq $2, %rdx +# endif + + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + + /* -1 because of the `dec %rdx` earlier. */ + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx + ja L(more_2x_vec) + +L(last_2x_vec): + /* This will be need to be computed no matter what. We do it + ahead of time for CHAR_PER_VEC == 64 because we can't adjust + the value of `tzcnt` with a shift. */ +# if CHAR_PER_VEC == 64 + tzcntq %rcx, %rcx +# endif + + cmpl $(CHAR_PER_VEC), %edx + jb L(ret_vec_x1_len) + + /* Seperate logic for CHAR_PER_VEC == 64 because we already did + `tzcnt` on VRCX. */ +# if CHAR_PER_VEC == 64 + /* cl == CHAR_PER_VEC iff it was zero before the `tzcnt`. */ + cmpb $CHAR_PER_VEC, %cl + jnz L(ret_vec_x1_no_bsf) +# else + test %VRCX, %VRCX + jnz L(ret_vec_x1) +# endif + + + + VPCMPEQ (VEC_SIZE * 2)(%rsi), %VZERO, %k0 + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + KMOV %k0, %VRCX + +# if CHAR_PER_VEC < 64 + /* This essentiallys adds CHAR_PER_VEC to computed result. */ + shlq $CHAR_PER_VEC, %rcx +# else + tzcntq %rcx, %rcx + addl $CHAR_PER_VEC, %ecx +# endif + + .p2align 4,, 4 +L(ret_vec_x1_len): + /* If CHAR_PER_VEC < 64 we still need to tzcnt, otherwise it has + already been done. */ +# if CHAR_PER_VEC < 64 + tzcntq %rcx, %rcx +# endif + cmpl %ecx, %edx + jbe L(ret_vec_x1_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) +L(ret_vec_x1_len_no_zfill_mov): + movl %ecx, %edx +# ifdef USE_AS_STPCPY + /* clear flags. 
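+   (CF in particular: the adcq below that forms the stpcpy return value must not pick up a stale carry on this path.)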
*/ + xorl %ecx, %ecx +# endif +L(ret_vec_x1_len_no_zfill): + VMOVU ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax +# else + leal (VEC_SIZE)(%rdx), %eax + adcq %rdi, %rax +# endif +# endif + ret + + + .p2align 4,, 10 +L(ret_vec_x1): + bsf %VRCX, %VRCX +L(ret_vec_x1_no_bsf): + VMOVU %VZERO, ((VEC_SIZE)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) + subl %ecx, %edx + cmpl $CHAR_PER_VEC, %edx + jb L(ret_vec_x1_len_no_zfill_mov) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + VMOVU %VZERO, (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE) +# ifdef USE_AS_STPCPY + leaq (VEC_SIZE * 1)(%rdi, %rcx, CHAR_SIZE), %rax +# endif + ret + + .p2align 4,, 8 +L(last_4x_vec): + /* Seperate logic for CHAR_PER_VEC == 64 because we can do `andl + $(CHAR_PER_VEC * 4 - 1), %edx` with less code size just + using `movzbl`. */ +# if CHAR_PER_VEC == 64 + movzbl %dl, %edx +# else + andl $(CHAR_PER_VEC * 4 - 1), %edx +# endif + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + subq $-(VEC_SIZE * 4), %rsi + subq $-(VEC_SIZE * 4), %rdi + cmpl $(CHAR_PER_VEC * 2 - 1), %edx + jbe L(last_2x_vec) + .p2align 4,, 8 +L(more_2x_vec): + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + test %VRCX, %VRCX + /* Must fill at least 2x VEC. */ + jnz L(zfill_vec1) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(2) + VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + /* Must fill at least 1x VEC. */ + jnz L(zfill_vec2) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(3) + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRCX + + /* Check if len is more 4x VEC. -1 because rdx is len - 1. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rdx + ja L(more_4x_vec) + + subl $(CHAR_PER_VEC * 3), %edx + jb L(ret_vec_x3_len) + + test %VRCX, %VRCX + jnz L(ret_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rsi), %VZERO, %k0 + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + KMOV %k0, %VRCX + tzcnt %VRCX, %VRCX + cmpl %ecx, %edx + jbe L(ret_vec_x4_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VZERO, ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) + movl %ecx, %edx +L(ret_vec_x4_len_no_zfill): + VMOVU ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rax +# else + leal (VEC_SIZE * 4 + 0)(%rdx), %eax + adcq %rdi, %rax +# endif +# endif + ret + + +L(ret_vec_x3_len): + addl $(CHAR_PER_VEC * 1), %edx + tzcnt %VRCX, %VRCX + cmpl %ecx, %edx + jbe L(ret_vec_x3_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) +L(ret_vec_x3_len_no_zfill_mov): + movl %ecx, %edx +# ifdef USE_AS_STPCPY + /* clear flags. 
*/ + xorl %ecx, %ecx +# endif + .p2align 4,, 4 +L(ret_vec_x3_len_no_zfill): + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0) + VMOVU %VMM(0), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx, CHAR_SIZE) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax +# else + leal (VEC_SIZE * 3 + 0)(%rdx), %eax + adcq %rdi, %rax +# endif +# endif + ret + + + .p2align 4,, 8 +L(ret_vec_x3): + bsf %VRCX, %VRCX + VMOVU %VZERO, (VEC_SIZE * 4 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx, CHAR_SIZE) + subl %ecx, %edx + jl L(ret_vec_x3_len_no_zfill_mov) + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE) +# ifdef USE_AS_STPCPY + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax +# endif + ret + + .p2align 4,, 8 +L(more_4x_vec): + VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) + test %VRCX, %VRCX + jnz L(zfill_vec3) + + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(4) + VMOVU %VMM(4), (VEC_SIZE * 4)(%rdi) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(zfill_vec4) -#define USE_AS_STRNCPY -#define STRCPY STRNCPY -#include "strcpy-evex.S" + /* Recheck length before aligning. */ + cmpq $(CHAR_PER_VEC * 8 - 1), %rdx + jbe L(last_4x_vec) + + /* Align rsi to VEC_SIZE * 4, need to readjust rdx / rdi. */ +# ifdef USE_AS_WCSCPY + leaq (%rsi, %rdx, CHAR_SIZE), %rdx +# else + addq %rsi, %rdx +# endif + subq %rsi, %rdi + subq $-(VEC_SIZE * 5), %rsi + andq $(VEC_SIZE * -4), %rsi + + + /* Load first half of the loop before entry. */ + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + + + /* Offset rsi by VEC_SIZE so that we can jump to + L(loop_last_4x_vec). */ + addq $-(VEC_SIZE), %rsi + KORTEST %k2, %k4 + jnz L(loop_4x_done) + + /* Store loop end in r9. */ + leaq -(VEC_SIZE * 5 - CHAR_SIZE)(%rdx), %r9 + + .p2align 4,, 11 +L(loop_4x_vec): + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi) + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi) + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi) + + subq $(VEC_SIZE * -4), %rsi + cmpq %rsi, %r9 + jbe L(loop_last_4x_vec) + + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 4 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPTESTN %VMM(4), %VMM(4), %k2 + VPTESTN %VMM(6), %VMM(6), %k4 + KORTEST %k2, %k4 + jz L(loop_4x_vec) + +L(loop_4x_done): + /* Restore rdx (length). */ + subq %rsi, %rdx +# ifdef USE_AS_WCSCPY + shrq $2, %rdx +# endif + VMOVU %VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi) + /* Restore rdi (dst). */ + addq %rsi, %rdi + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(zfill_vec1) + + VMOVU %VMM(1), (VEC_SIZE * 2 + 0)(%rdi) + KMOV %k2, %VRCX + test %VRCX, %VRCX + jnz L(zfill_vec2) + + VMOVU %VMM(2), (VEC_SIZE * 3 + 0)(%rdi) + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(zfill_vec3) + + VMOVU %VMM(3), (VEC_SIZE * 4 + 0)(%rdi) + KMOV %k4, %VRCX + // Zfill more.... 
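+ /* Fall through: %VRCX (from %k4) holds the null position within the fourth VEC of this iteration, so adjust %rdi/%rdx and zero-fill the rest of the buffer below. */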
+ + .p2align 4,, 4 +L(zfill_vec4): + subq $(VEC_SIZE * -2), %rdi + addq $(CHAR_PER_VEC * -2), %rdx +L(zfill_vec2): + subq $(VEC_SIZE * -2), %rdi + addq $(CHAR_PER_VEC * -1), %rdx +L(zfill): + /* VRCX must be non-zero. */ + bsf %VRCX, %VRCX + + /* Adjust length / dst for zfill. */ + subq %rcx, %rdx +# ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_from_page_cross): + + /* From here on out its just memset(rdi, 0, rdx). */ + cmpq $CHAR_PER_VEC, %rdx + jb L(zfill_less_vec) + +L(zfill_more_1x_vec): + VMOVU %VZERO, (%rdi) + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) + cmpq $(CHAR_PER_VEC * 2 - 1), %rdx + ja L(zfill_more_2x_vec) +L(zfill_done0): + ret + + /* Coming from vec1/vec2 we must be able to zfill at least 2x + VEC. */ + .p2align 4,, 8 +L(zfill_vec3): + subq $(VEC_SIZE * -2), %rdi + addq $(CHAR_PER_VEC * -2), %rdx + .p2align 4,, 2 +L(zfill_vec1): + bsfq %rcx, %rcx + /* rdi is currently dst - VEC_SIZE so add back VEC_SIZE here. + */ + leaq VEC_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi + subq %rcx, %rdx +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + + + VMOVU %VZERO, (%rdi) + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) + cmpq $(CHAR_PER_VEC * 2), %rdx + jb L(zfill_done0) +L(zfill_more_2x_vec): + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + VMOVU %VZERO, (VEC_SIZE)(%rdi) + subq $(CHAR_PER_VEC * 4 - 1), %rdx + jbe L(zfill_done) + +# ifdef USE_AS_WCSCPY + leaq (%rdi, %rdx, CHAR_SIZE), %rdx +# else + addq %rdi, %rdx +# endif + + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi) + VMOVU %VZERO, (VEC_SIZE * 3)(%rdi) + + + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) + + subq $-(VEC_SIZE * 4), %rdi + cmpq %rdi, %rdx + jbe L(zfill_done) + + /* Align rdi and zfill loop. */ + andq $-(VEC_SIZE), %rdi + .p2align 4,, 12 +L(zfill_loop_4x_vec): + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpq %rdi, %rdx + ja L(zfill_loop_4x_vec) +L(zfill_done): + ret + + + /* Less 1x VEC case if we are not using evex masked store. */ +# if !USE_EVEX_MASKED_STORE + .p2align 4,, 8 +L(copy_1x): + /* Special case for copy 1x. It can be handled quickly and many + buffer sizes have convenient alignment. */ + VMOVU %VMM(0), (%rdi) + /* If no zeros then we are done. */ + testl %ecx, %ecx + jz L(ret_1x_1x) + + /* Need to zfill, not we know that length <= CHAR_PER_VEC so we + only handle the small case here. */ + bsf %VRCX, %VRCX +L(zfill_less_vec_no_bsf): + /* Adjust length / dst then just zfill less_vec. */ + subq %rcx, %rdx +# ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + +L(zfill_less_vec): + cmpl $((VEC_SIZE / 2) / CHAR_SIZE), %edx + jb L(zfill_less_half) + + VMOVU %VZERO_HALF, (%rdi) + VMOVU %VZERO_HALF, -((VEC_SIZE / 2)- CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + ret +# ifdef USE_AS_STPCPY +L(ret_1x_1x): + leaq CHAR_SIZE(%rdi, %rdx, CHAR_SIZE), %rax + ret +# endif + + +# if VEC_SIZE == 64 + .p2align 4,, 4 +L(copy_32_63): + /* Overfill to avoid branches. 
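+   The first 32 bytes (already in %VMM(0)) and the last 32 bytes of the copy are stored unconditionally; for copies shorter than 64 bytes the two stores simply overlap instead of branching on the exact length.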
*/ + VMOVU -(32 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1) + VMOVU %VMM_256(0), (%rdi) + VMOVU %VMM_256(1), -(32 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + + /* We are taking advantage of the fact that to be here we must + be writing null-term as (%rdi, %rcx) we have a byte of lee- + way for overwriting. */ + cmpl %ecx, %edx + ja L(zfill_less_vec_no_bsf) +# ifndef USE_AS_STPCPY +L(ret_1x_1x): +# else +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# endif + ret +# endif + + .p2align 4,, 4 +L(copy_16_31): + /* Overfill to avoid branches. */ + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %xmm1 + VMOVU %VMM_128(0), (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + cmpl %ecx, %edx + + /* Seperate logic depending on VEC_SIZE. If VEC_SIZE == 64 then + we have a larger copy block for 32-63 so this is just falls + through to zfill 16-31. If VEC_SIZE == 32 then we check for + full zfill of less 1x VEC. */ +# if VEC_SIZE == 64 + jbe L(ret_16_31) + subl %ecx, %edx +# ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_less_half): +L(zfill_less_32): + cmpl $(16 / CHAR_SIZE), %edx + jb L(zfill_less_16) + VMOVU %VZERO_128, (%rdi) + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) +# ifdef USE_AS_STPCPY + ret +# endif +L(ret_16_31): +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# endif + ret +# else + /* VEC_SIZE == 32 begins. */ + ja L(zfill_less_vec_no_bsf) +# ifndef USE_AS_STPCPY +L(ret_1x_1x): +# else +# ifdef USE_AS_WCSCPY + adcq $0, %rdx + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# endif + ret +# endif + + + .p2align 4,, 4 +L(copy_8_15): + /* Overfill to avoid branches. */ + movq -(8 - CHAR_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi + vmovq %VMM_128(0), (%rdi) + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + cmpl %ecx, %edx + jbe L(ret_8_15) + subl %ecx, %edx +# ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +# else + addq %rcx, %rdi +# endif +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + .p2align 4,, 8 +# if VEC_SIZE == 32 +L(zfill_less_half): +# endif +L(zfill_less_16): + xorl %ecx, %ecx + cmpl $(8 / CHAR_SIZE), %edx + jb L(zfill_less_8) + movq %rcx, (%rdi) + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) +# ifndef USE_AS_STPCPY +L(ret_8_15): +# endif + ret + + .p2align 4,, 8 +L(less_1x_vec): + je L(copy_1x) + + /* We will need `tzcnt` result for all other copy sizes. */ + tzcnt %VRCX, %VRCX +# if VEC_SIZE == 64 + cmpl $(32 / CHAR_SIZE), %edx + jae L(copy_32_63) +# endif + + cmpl $(16 / CHAR_SIZE), %edx + jae L(copy_16_31) + + cmpl $(8 / CHAR_SIZE), %edx + jae L(copy_8_15) +# ifdef USE_AS_WCSCPY + testl %ecx, %ecx + jz L(zfill_less_8_set_ret) + + movl (%rsi, %rdx, CHAR_SIZE), %esi + vmovd %VMM_128(0), (%rdi) + movl %esi, (%rdi, %rdx, CHAR_SIZE) +# ifdef USE_AS_STPCPY + cmpl %ecx, %edx +L(ret_8_15): + adcq $0, %rdx + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# endif + ret +L(zfill_less_8_set_ret): + xorl %ecx, %ecx +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_less_8): + movl %ecx, (%rdi) + movl %ecx, (%rdi, %rdx, CHAR_SIZE) + ret +# else + cmpl $3, %edx + jb L(copy_0_3) + /* Overfill to avoid branches. 
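The small-size copies use the same overlap trick for the data itself ("overfill to avoid branches"): load the first and the last chunk of the region, store both, and let them overlap so a single path covers a whole range of lengths. A plain-C sketch for the 8-to-16-byte case (the function name is made up; the real L(copy_8_15) indexes by characters and folds the zero-fill check into the same path):

#include <stdint.h>
#include <string.h>
#include <stddef.h>

/* Copy len bytes, 8 <= len <= 16, with one 8-byte move from the start
   and one 8-byte move ending exactly at dst + len.  The two moves
   overlap whenever len < 16, which is harmless.  */
static void
copy_8_to_16 (char *dst, const char *src, size_t len)
{
  uint64_t head, tail;
  memcpy (&head, src, 8);
  memcpy (&tail, src + len - 8, 8);
  memcpy (dst, &head, 8);
  memcpy (dst + len - 8, &tail, 8);
}

L(copy_16_31) and L(copy_32_63) are the same pattern with 16- and 32-byte moves.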
*/ + movl -3(%rsi, %rdx), %esi + vmovd %VMM_128(0), (%rdi) + movl %esi, -3(%rdi, %rdx) + cmpl %ecx, %edx + jbe L(ret_4_7) + subq %rcx, %rdx + addq %rcx, %rdi +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + xorl %ecx, %ecx + .p2align 4,, 8 +L(zfill_less_8): + cmpl $3, %edx + jb L(zfill_less_3) + movl %ecx, (%rdi) + movl %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + ret +# endif + +L(ret_4_7): +# ifdef USE_AS_STPCPY +L(ret_8_15): + movl %edx, %eax + adcq %rdi, %rax +# endif + ret + + .p2align 4,, 4 +L(zfill_less_3): + testl %edx, %edx + jz L(zfill_1) + movw %cx, (%rdi) +L(zfill_1): + movb %cl, (%rdi, %rdx) + ret + + .p2align 4,, 8 +L(copy_0_3): + vmovd %VMM_128(0), %r8d + testl %edx, %edx + jz L(copy_1) + movw %r8w, (%rdi) + cmpl %ecx, %edx + ja L(zfill_from_1) + movzbl (%rsi, %rdx), %r8d +# ifdef USE_AS_STPCPY + movl %edx, %eax + adcq %rdi, %rax + movb %r8b, (%rdi, %rdx) + ret +# endif + +L(copy_1): +# ifdef USE_AS_STPCPY + movl %edx, %eax + cmpl %ecx, %edx + adcq %rdi, %rax +# endif +# ifdef USE_AS_WCSCPY + vmovd %VMM_128(0), (%rdi) +# else + movb %r8b, (%rdi, %rdx) +# endif + ret +# endif + + +# ifndef USE_AS_WCSCPY + .p2align 4,, 8 +L(zfill_from_1): +# ifdef USE_AS_STPCPY + leaq (%rdi, %rcx), %rax +# endif + movw $0, -1(%rdi, %rdx) + ret +# endif + + .p2align 4,, 4 +L(zero_len): + incq %rdx + jne L(best_effort_strncpy) + movq %rdi, %rax + ret +# endif + + + .p2align 4,, 4 + .p2align 6,, 8 +L(page_cross): + movq %rsi, %rax + andq $(VEC_SIZE * -1), %rax + VPCMPEQ (%rax), %VZERO, %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WCSCPY + movl %esi, %r8d + shrl $2, %r8d + andl $(CHAR_PER_VEC - 1), %r8d + shrx %VR8, %VRCX, %VRCX +# else + shrx %VRSI, %VRCX, %VRCX +# endif + + /* Compute amount of bytes we checked. */ + subl %esi, %eax + andl $(VEC_SIZE - 1), %eax +# ifdef USE_AS_WCSCPY + shrl $2, %eax +# endif + + /* If rax > rdx then we are finishing the copy at the end of the + page. */ + cmpq %rax, %rdx + jb L(page_cross_small) + + + /* If rcx is non-zero then continue. */ + test %VRCX, %VRCX + jz L(page_cross_continue) + + /* We found zero-CHAR so need to copy then zfill (we know we + didn't cover all of length here). */ + bsf %VRCX, %VRCX +L(movsb_and_zfill): + incl %ecx + subq %rcx, %rdx +# ifdef USE_AS_STPCPY + leaq -CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rax +# else + movq %rdi, %rax +# endif + + REP_MOVS +# ifdef USE_AS_WCSCPY + movl $0, (%rdi) +# else + movb $0, (%rdi) +# endif + jmp L(zfill_from_page_cross) + +L(page_cross_small): + tzcnt %VRCX, %VRCX + cmpl %ecx, %edx + jbe L(page_cross_copy_only) + + /* Do a zfill of the tail before copying. */ + movq %rdi, %r9 + xorl %eax, %eax + + movl %ecx, %r8d + + subl %ecx, %edx + leaq CHAR_SIZE(%rdi, %rcx, CHAR_SIZE), %rdi + movl %edx, %ecx + REP_STOS + movq %r9, %rdi + movl %r8d, %edx +L(page_cross_copy_only): + leal 1(%rdx), %ecx +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + adcl $0, %edx + leaq (%rdi, %rdx, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# else + movq %rdi, %rax +# endif + REP_MOVS + ret + + +L(best_effort_strncpy): + movq %rdx, %rcx + xorl %eax, %eax + movq %rdi, %r8 + /* The length is >= 2^63. We very much so expect to segfault at + rep stos. If that doesn't happen then just strcpy to finish. 
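The page-cross entry probes the source without ever reading past the page boundary: it loads the aligned vector that contains the start, compares the whole vector against zero, and shifts the mask right so that bit 0 lines up with the first real character (the shrx). A stand-alone C sketch for the byte case with 64-byte vectors (the function name is made up):

#include <immintrin.h>
#include <stdint.h>

/* Mask of null-terminator positions from s up to the next 64-byte
   boundary.  The aligned 64-byte load cannot fault past the page, and
   the shift discards the bytes that precede s.  Requires AVX512BW.  */
static uint64_t
page_cross_zero_mask (const char *s)
{
  const char *aligned = (const char *) ((uintptr_t) s & ~(uintptr_t) 63);
  __m512i v = _mm512_load_si512 ((const void *) aligned);
  uint64_t m = _mm512_cmpeq_epi8_mask (v, _mm512_setzero_si512 ()); /* VPCMPEQ (%rax), %VZERO */
  return m >> ((uintptr_t) s & 63);				    /* shrx %VRSI, %VRCX, %VRCX */
}

The asm then combines this mask with the remaining length to choose between L(page_cross_small), L(movsb_and_zfill) and L(page_cross_continue).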
+	 */
+	REP_STOS
+	movq	%r8, %rdi
+	jmp	OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
new file mode 100644
index 0000000000..d5ff4cbe50
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h
@@ -0,0 +1,65 @@
+#ifndef _STRNCPY_OR_CAT_OVERFLOW_DEF_H_
+#define _STRNCPY_OR_CAT_OVERFLOW_DEF_H_ 1
+
+#if defined USE_MULTIARCH && IS_IN(libc)
+# define UNDERSCORES __
+# ifdef USE_WITH_SSE2
+# define ISA_EXT _sse2
+# elif defined USE_WITH_AVX
+# ifdef USE_WITH_RTM
+# define ISA_EXT _avx_rtm
+# else
+# define ISA_EXT _avx
+# endif
+# elif defined USE_WITH_AVX2
+# ifdef USE_WITH_RTM
+# define ISA_EXT _avx2_rtm
+# else
+# define ISA_EXT _avx2
+# endif
+
+# elif defined USE_WITH_EVEX256
+# define ISA_EXT _evex
+# elif defined USE_WITH_EVEX512
+# define ISA_EXT _evex512
+# endif
+#else
+# define UNDERSCORES
+# define ISA_EXT
+#endif
+
+#ifdef USE_AS_WCSCPY
+# define STRCPY_PREFIX wc
+# define STRCAT_PREFIX wcs
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX scpy
+# endif
+#else
+# define STRCPY_PREFIX st
+# define STRCAT_PREFIX str
+# ifdef USE_AS_STPCPY
+# define STRCPY_POSTFIX pcpy
+# else
+# define STRCPY_POSTFIX rcpy
+# endif
+#endif
+#define STRCAT_POSTFIX cat
+
+#define PRIMITIVE_OF_NAMER(underscores, prefix, postfix, ext) \
+  underscores##prefix##postfix##ext
+
+#define OF_NAMER(...) PRIMITIVE_OF_NAMER (__VA_ARGS__)
+
+#ifndef OVERFLOW_STRCPY
+# define OVERFLOW_STRCPY \
+  OF_NAMER (UNDERSCORES, STRCPY_PREFIX, STRCPY_POSTFIX, ISA_EXT)
+#endif
+
+#ifndef OVERFLOW_STRCAT
+# define OVERFLOW_STRCAT \
+  OF_NAMER (UNDERSCORES, STRCAT_PREFIX, STRCAT_POSTFIX, ISA_EXT)
+#endif
+
+#endif
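strncpy-or-cat-overflow-def.h only exists to paste together the name of the full-width fallback routine that L(best_effort_strncpy) tail-calls through OVERFLOW_STRCPY. Here is a stand-alone sketch of how the pasting resolves; the three leading defines are stand-ins for what the glibc build normally provides (IS_IN in particular is glibc's own macro, reduced here to a one-liner just so the multiarch branch is taken), and the header is assumed to be on the include path:

#include <stdio.h>

#define USE_MULTIARCH 1
#define IS_IN(lib) 1
#define USE_WITH_EVEX256 1
#include "strncpy-or-cat-overflow-def.h"

#define STRINGIFY_1(x) #x
#define STRINGIFY(x) STRINGIFY_1 (x)

int
main (void)
{
  /* UNDERSCORES ## STRCPY_PREFIX ## STRCPY_POSTFIX ## ISA_EXT,
     i.e. __ ## st ## rcpy ## _evex.  */
  puts (STRINGIFY (OVERFLOW_STRCPY));	/* prints "__strcpy_evex" */
  puts (STRINGIFY (OVERFLOW_STRCAT));	/* prints "__strcat_evex" */
  return 0;
}

Defining USE_AS_STPCPY and/or USE_AS_WCSCPY before the include switches the prefix/postfix pair, e.g. to __stpcpy_evex for the stpncpy build.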