Message ID: 20221109013841.3707572-2-goldstein.w.n@gmail.com
State: New
Series: [v5,1/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
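
The quoted patch below gives "use more overlapping stores to avoid branches" as its first optimization; that idiom is what the short-copy paths such as L(copy_4_7), L(copy_8_15) and L(copy_16_31) in the diff implement. As a rough scalar-C illustration only (not code from the patch; the helper name and the 4-7 byte range are made up for the example), copying a short tail with two possibly-overlapping fixed-size stores instead of a length-dependent branch chain looks like this:

#include <string.h>
#include <stdint.h>

/* Copy a tail of len bytes, 4 <= len <= 7, with exactly two 4-byte
   loads and two 4-byte stores.  The second pair starts at len - 4, so
   for len < 8 it overlaps the first pair; the overlap is harmless and
   removes any need to branch on the exact length.  */
static void
copy_tail_4_to_7 (char *dst, const char *src, size_t len)
{
  uint32_t head, tail;
  memcpy (&head, src, 4);               /* bytes [0, 4)          */
  memcpy (&tail, src + len - 4, 4);     /* bytes [len - 4, len)  */
  memcpy (dst, &head, 4);
  memcpy (dst + len - 4, &tail, 4);     /* overlaps head if len < 8 */
}

The AVX2 code applies the same trick one size class up at a time (8-, 16- and 32-byte chunks), which is why each tail copy costs a couple of loads and stores rather than a branch per possible length.
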
On Tue, Nov 08, 2022 at 05:38:39PM -0800, Noah Goldstein wrote: > Optimizations are: > 1. Use more overlapping stores to avoid branches. > 2. Reduce how unrolled the aligning copies are (this is more of a > code-size save, its a negative for some sizes in terms of > perf). > 3. For st{r|p}n{cat|cpy} re-order the branches to minimize the > number that are taken. > > Performance Changes: > > Times are from N = 10 runs of the benchmark suite and are > reported as geometric mean of all ratios of > New Implementation / Old Implementation. > > strcat-avx2 -> 0.998 > strcpy-avx2 -> 0.937 > stpcpy-avx2 -> 0.971 > > strncpy-avx2 -> 0.793 > stpncpy-avx2 -> 0.775 > > strncat-avx2 -> 0.962 > > Code Size Changes: > function -> Bytes New / Bytes Old -> Ratio > > strcat-avx2 -> 685 / 1639 -> 0.418 > strcpy-avx2 -> 560 / 903 -> 0.620 > stpcpy-avx2 -> 592 / 939 -> 0.630 > > strncpy-avx2 -> 1176 / 2390 -> 0.492 > stpncpy-avx2 -> 1268 / 2438 -> 0.520 > > strncat-avx2 -> 1042 / 2563 -> 0.407 > > Notes: > 1. Because of the significant difference between the > implementations they are split into three files. > > strcpy-avx2.S -> strcpy, stpcpy, strcat > strncpy-avx2.S -> strncpy > strncat-avx2.S > strncat > > I couldn't find a way to merge them without making the > ifdefs incredibly difficult to follow. > > Full check passes on x86-64 and build succeeds for all ISA levels w/ > and w/o multiarch. > --- > sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 6 +- > sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 7 +- > sysdeps/x86_64/multiarch/stpncpy-avx2.S | 5 +- > sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 13 +- > sysdeps/x86_64/multiarch/strcat-avx2.S | 268 +--- > .../x86_64/multiarch/strcat-strlen-avx2.h.S | 101 ++ > sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 13 +- > sysdeps/x86_64/multiarch/strcpy-avx2.S | 1236 +++++------------ > sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 6 +- > sysdeps/x86_64/multiarch/strncat-avx2.S | 424 +++++- > sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 6 +- > sysdeps/x86_64/multiarch/strncpy-avx2.S | 740 +++++++++- > sysdeps/x86_64/multiarch/x86-avx-vecs.h | 3 +- > 13 files changed, 1594 insertions(+), 1234 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S > > diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S > index 2b9c07a59f..90e532dbe8 100644 > --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S > @@ -1,3 +1,3 @@ > -#define USE_AS_STPCPY > -#define STRCPY __stpcpy_avx2_rtm > -#include "strcpy-avx2-rtm.S" > +#define STPCPY __stpcpy_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > +#include "stpcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S > index 60a2ccfe53..46ee07be36 100644 > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S > @@ -1,4 +1,3 @@ > -#define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY __stpncpy_avx2_rtm > -#include "strcpy-avx2-rtm.S" > +#define STPNCPY __stpncpy_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > +#include "stpncpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S > index b2f8c19143..a46a8edbe2 100644 > --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S > +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S > @@ -3,6 +3,5 @@ > #endif > > #define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY STPNCPY > -#include "strcpy-avx2.S" > +#define STRNCPY 
STPNCPY > +#include "strncpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S > index 637fb557c4..e84f4f1fef 100644 > --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S > @@ -1,12 +1,3 @@ > -#ifndef STRCAT > -# define STRCAT __strcat_avx2_rtm > -#endif > - > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > - > -#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > - > -#define SECTION(p) p##.avx.rtm > - > +#define STRCAT __strcat_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > #include "strcat-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S > index d9b7fb2a43..3f914fa342 100644 > --- a/sysdeps/x86_64/multiarch/strcat-avx2.S > +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S > @@ -16,266 +16,10 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <isa-level.h> > - > -#if ISA_SHOULD_BUILD (3) > - > - > -# include <sysdep.h> > - > -# ifndef STRCAT > -# define STRCAT __strcat_avx2 > -# endif > - > -# define USE_AS_STRCAT > - > -/* Number of bytes in a vector register */ > -# define VEC_SIZE 32 > - > -# ifndef SECTION > -# define SECTION(p) p##.avx > -# endif > - > - .section SECTION(.text),"ax",@progbits > -ENTRY (STRCAT) > - mov %rdi, %r9 > -# ifdef USE_AS_STRNCAT > - mov %rdx, %r8 > -# endif > - > - xor %eax, %eax > - mov %edi, %ecx > - and $((VEC_SIZE * 4) - 1), %ecx > - vpxor %xmm6, %xmm6, %xmm6 > - cmp $(VEC_SIZE * 3), %ecx > - ja L(fourth_vector_boundary) > - vpcmpeqb (%rdi), %ymm6, %ymm0 > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_first_vector) > - mov %rdi, %rax > - and $-VEC_SIZE, %rax > - jmp L(align_vec_size_start) > -L(fourth_vector_boundary): > - mov %rdi, %rax > - and $-VEC_SIZE, %rax > - vpcmpeqb (%rax), %ymm6, %ymm0 > - mov $-1, %r10d > - sub %rax, %rcx > - shl %cl, %r10d > - vpmovmskb %ymm0, %edx > - and %r10d, %edx > - jnz L(exit) > - > -L(align_vec_size_start): > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 > - vpmovmskb %ymm2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 > - vpmovmskb %ymm3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 > - add $(VEC_SIZE * 4), %rax > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 > - vpmovmskb %ymm2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 > - vpmovmskb %ymm3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 > - add $(VEC_SIZE * 4), %rax > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 > - vpmovmskb 
%ymm2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 > - vpmovmskb %ymm3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 > - add $(VEC_SIZE * 4), %rax > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 > - vpmovmskb %ymm2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 > - vpmovmskb %ymm3, %edx > - test %edx, %edx > - jnz L(exit_null_on_fifth_vector) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 > - add $(VEC_SIZE * 5), %rax > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 > - add $VEC_SIZE, %rax > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 > - add $VEC_SIZE, %rax > - vpmovmskb %ymm2, %edx > - test %edx, %edx > - jnz L(exit) > - > - test $((VEC_SIZE * 4) - 1), %rax > - jz L(align_four_vec_loop) > - > - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 > - add $VEC_SIZE, %rax > - vpmovmskb %ymm3, %edx > - test %edx, %edx > - jnz L(exit) > - > - add $VEC_SIZE, %rax > - > - .p2align 4 > -L(align_four_vec_loop): > - vmovaps (%rax), %ymm4 > - vpminub VEC_SIZE(%rax), %ymm4, %ymm4 > - vmovaps (VEC_SIZE * 2)(%rax), %ymm5 > - vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 > - add $(VEC_SIZE * 4), %rax > - vpminub %ymm4, %ymm5, %ymm5 > - vpcmpeqb %ymm5, %ymm6, %ymm5 > - vpmovmskb %ymm5, %edx > - test %edx, %edx > - jz L(align_four_vec_loop) > - > - vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 > - sub $(VEC_SIZE * 5), %rax > - vpmovmskb %ymm0, %edx > - test %edx, %edx > - jnz L(exit_null_on_second_vector) > - > - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 > - vpmovmskb %ymm1, %edx > - test %edx, %edx > - jnz L(exit_null_on_third_vector) > - > - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 > - vpmovmskb %ymm2, %edx > - test %edx, %edx > - jnz L(exit_null_on_fourth_vector) > - > - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 > - vpmovmskb %ymm3, %edx > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 4), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit): > - sub %rdi, %rax > -L(exit_null_on_first_vector): > - bsf %rdx, %rdx > - add %rdx, %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_second_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $VEC_SIZE, %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_third_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 2), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_fourth_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 3), %rax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_null_on_fifth_vector): > - sub %rdi, %rax > - bsf %rdx, %rdx > - add %rdx, %rax > - add $(VEC_SIZE * 4), %rax > - > - .p2align 4 > -L(StartStrcpyPart): > - lea (%r9, %rax), %rdi > - mov %rsi, %rcx > - mov %r9, %rax /* save result */ > - > -# ifdef 
USE_AS_STRNCAT > - test %r8, %r8 > - jz L(ExitZero) > -# define USE_AS_STRNCPY > -# endif > - > -# include "strcpy-avx2.S" > +#ifndef STRCAT > +# define STRCAT __strcat_avx2 > #endif > + > +#define USE_AS_STRCAT > +#define STRCPY STRCAT > +#include "strcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S > new file mode 100644 > index 0000000000..f50514e07c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S > @@ -0,0 +1,101 @@ > +/* strlen used for begining of str{n}cat using AVX2. > + Copyright (C) 2011-2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +/* NOTE: This file is meant to be included by strcat-avx2 or > + strncat-avx2 and does not standalone. Before including %rdi > + must be saved in %rax. */ > + > + > +/* Simple strlen implementation that ends at > + L(strcat_strlen_done). */ > + movq %rdi, %r8 > + andq $(VEC_SIZE * -1), %r8 > + VPCMPEQ (%r8), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + shrxl %edi, %ecx, %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v0) > + > + VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + leaq (VEC_SIZE)(%r8), %rdi > + testl %ecx, %ecx > + jnz L(bsf_and_done_v0) > + > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v1) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v2) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v3) > + > + orq $(VEC_SIZE * 4 - 1), %rdi > + .p2align 4,, 8 > +L(loop_2x_vec): > + VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0) > + VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2) > + VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3) > + VPMIN %VMM(1), %VMM(3), %VMM(3) > + VPCMPEQ %VMM(3), %VZERO, %VMM(3) > + vpmovmskb %VMM(3), %r8d > + subq $(VEC_SIZE * -4), %rdi > + testl %r8d, %r8d > + jz L(loop_2x_vec) > + > + addq $(VEC_SIZE * -4 + 1), %rdi > + > + VPCMPEQ %VMM(0), %VZERO, %VMM(0) > + vpmovmskb %VMM(0), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v0) > + > + VPCMPEQ %VMM(1), %VZERO, %VMM(1) > + vpmovmskb %VMM(1), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v1) > + > + VPCMPEQ %VMM(2), %VZERO, %VMM(2) > + vpmovmskb %VMM(2), %ecx > + testl %ecx, %ecx > + jnz L(bsf_and_done_v2) > + > + movl %r8d, %ecx > +L(bsf_and_done_v3): > + addq $VEC_SIZE, %rdi > +L(bsf_and_done_v2): > + bsfl %ecx, %ecx > + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi > + jmp L(strcat_strlen_done) > + > + .p2align 4,, 4 > +L(bsf_and_done_v1): > + addq $VEC_SIZE, %rdi > +L(bsf_and_done_v0): > + bsfl %ecx, %ecx > + addq %rcx, %rdi > 
+L(strcat_strlen_done): > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S > index c2c581ecf7..3ae2de8ea9 100644 > --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S > @@ -1,12 +1,3 @@ > -#ifndef STRCPY > -# define STRCPY __strcpy_avx2_rtm > -#endif > - > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > - > -#define VZEROUPPER_RETURN jmp L(return_vzeroupper) > - > -#define SECTION(p) p##.avx.rtm > - > +#define STRCPY __strcpy_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > #include "strcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S > index c725834929..32f86baa4c 100644 > --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S > +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S > @@ -20,984 +20,378 @@ > > #if ISA_SHOULD_BUILD (3) > > +# include <sysdep.h> > > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > - > -# ifndef STRCPY > -# define STRCPY __strcpy_avx2 > -# endif > - > -# endif > - > -/* Number of bytes in a vector register */ > # ifndef VEC_SIZE > -# define VEC_SIZE 32 > -# endif > - > -# ifndef VZEROUPPER > -# define VZEROUPPER vzeroupper > -# endif > - > -# ifndef SECTION > -# define SECTION(p) p##.avx > -# endif > - > -/* zero register */ > -#define xmmZ xmm0 > -#define ymmZ ymm0 > - > -/* mask register */ > -#define ymmM ymm1 > - > -# ifndef USE_AS_STRCAT > - > - .section SECTION(.text),"ax",@progbits > -ENTRY (STRCPY) > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > - test %R8_LP, %R8_LP > - jz L(ExitZero) > -# endif > - mov %rsi, %rcx > -# ifndef USE_AS_STPCPY > - mov %rdi, %rax /* save result */ > -# endif > - > +# include "x86-avx-vecs.h" > # endif > > - vpxor %xmmZ, %xmmZ, %xmmZ > - > - and $((VEC_SIZE * 4) - 1), %ecx > - cmp $(VEC_SIZE * 2), %ecx > - jbe L(SourceStringAlignmentLessTwoVecSize) > - > - and $-VEC_SIZE, %rsi > - and $(VEC_SIZE - 1), %ecx > - > - vpcmpeqb (%rsi), %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - shr %cl, %rdx > - > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - mov $VEC_SIZE, %r10 > - sub %rcx, %r10 > - cmp %r10, %r8 > -# else > - mov $(VEC_SIZE + 1), %r10 > - sub %rcx, %r10 > - cmp %r10, %r8 > -# endif > - jbe L(CopyVecSizeTailCase2OrCase3) > +# ifndef STRCPY > +# define STRCPY __strcpy_avx2 > # endif > - test %edx, %edx > - jnz L(CopyVecSizeTail) > > - vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 > - vpmovmskb %ymm2, %edx > + /* Use movsb in page cross case to save code size. 
*/ > +# define USE_MOVSB_IN_PAGE_CROSS 1 > > -# ifdef USE_AS_STRNCPY > - add $VEC_SIZE, %r10 > - cmp %r10, %r8 > - jbe L(CopyTwoVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyTwoVecSize) > - > - vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ > - vmovdqu %ymm2, (%rdi) > - > -/* If source address alignment != destination address alignment */ > - .p2align 4 > -L(UnalignVecSizeBoth): > - sub %rcx, %rdi > -# ifdef USE_AS_STRNCPY > - add %rcx, %r8 > - sbb %rcx, %rcx > - or %rcx, %r8 > -# endif > - mov $VEC_SIZE, %rcx > - vmovdqa (%rsi, %rcx), %ymm2 > - vmovdqu %ymm2, (%rdi, %rcx) > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 > - vpcmpeqb %ymm2, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 3), %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > +# ifdef USE_AS_WCSCPY > +# define VPCMPEQ vpcmpeqd > +# define VPMIN vpminud > +# define CHAR_SIZE 4 > # else > - jnz L(CopyVecSize) > +# define VPCMPEQ vpcmpeqb > +# define VPMIN vpminub > +# define CHAR_SIZE 1 > # endif > > - vmovdqu %ymm2, (%rdi, %rcx) > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 > - vpcmpeqb %ymm3, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec3) > -# else > - jnz L(CopyVecSize) > -# endif > +# define PAGE_SIZE 4096 > > - vmovdqu %ymm3, (%rdi, %rcx) > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 > - vpcmpeqb %ymm4, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec4) > +# ifdef USE_AS_STPCPY > +# define END_REG rax > # else > - jnz L(CopyVecSize) > +# define END_REG rdi, %rdx > # endif > > - vmovdqu %ymm4, (%rdi, %rcx) > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 > - vpcmpeqb %ymm2, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > +# ifdef USE_AS_STRCAT > +# define PAGE_ALIGN_REG ecx > # else > - jnz L(CopyVecSize) > +# define PAGE_ALIGN_REG eax > # endif > > - vmovdqu %ymm2, (%rdi, %rcx) > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 > - vpcmpeqb %ymm2, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec2) > -# else > - jnz L(CopyVecSize) > -# endif > +# define VZERO VMM(7) > +# define VZERO_128 VMM_128(7) > > - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 > - vmovdqu %ymm2, (%rdi, %rcx) > - vpcmpeqb %ymm3, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $VEC_SIZE, %rcx > -# ifdef USE_AS_STRNCPY > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > -# endif > - test %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec3) > -# else > - jnz L(CopyVecSize) > -# endif > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRCPY) > + vpxor %VZERO_128, 
%VZERO_128, %VZERO_128 > > - vmovdqu %ymm3, (%rdi, %rcx) > - mov %rsi, %rdx > - lea VEC_SIZE(%rsi, %rcx), %rsi > - and $-(VEC_SIZE * 4), %rsi > - sub %rsi, %rdx > - sub %rdx, %rdi > -# ifdef USE_AS_STRNCPY > - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 > -# endif > -L(UnalignedFourVecSizeLoop): > - vmovdqa (%rsi), %ymm4 > - vmovdqa VEC_SIZE(%rsi), %ymm5 > - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 > - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 > - vpminub %ymm5, %ymm4, %ymm2 > - vpminub %ymm7, %ymm6, %ymm3 > - vpminub %ymm2, %ymm3, %ymm3 > - vpcmpeqb %ymmM, %ymm3, %ymm3 > - vpmovmskb %ymm3, %edx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 4), %r8 > - jbe L(UnalignedLeaveCase2OrCase3) > -# endif > - test %edx, %edx > - jnz L(UnalignedFourVecSizeLeave) > - > -L(UnalignedFourVecSizeLoop_start): > - add $(VEC_SIZE * 4), %rdi > - add $(VEC_SIZE * 4), %rsi > - vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) > - vmovdqa (%rsi), %ymm4 > - vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) > - vmovdqa VEC_SIZE(%rsi), %ymm5 > - vpminub %ymm5, %ymm4, %ymm2 > - vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) > - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 > - vmovdqu %ymm7, -VEC_SIZE(%rdi) > - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 > - vpminub %ymm7, %ymm6, %ymm3 > - vpminub %ymm2, %ymm3, %ymm3 > - vpcmpeqb %ymmM, %ymm3, %ymm3 > - vpmovmskb %ymm3, %edx > -# ifdef USE_AS_STRNCPY > - sub $(VEC_SIZE * 4), %r8 > - jbe L(UnalignedLeaveCase2OrCase3) > -# endif > - test %edx, %edx > - jz L(UnalignedFourVecSizeLoop_start) > - > -L(UnalignedFourVecSizeLeave): > - vpcmpeqb %ymm4, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - test %edx, %edx > - jnz L(CopyVecSizeUnaligned_0) > - > - vpcmpeqb %ymm5, %ymmZ, %ymmM > - vpmovmskb %ymmM, %ecx > - test %ecx, %ecx > - jnz L(CopyVecSizeUnaligned_16) > - > - vpcmpeqb %ymm6, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - test %edx, %edx > - jnz L(CopyVecSizeUnaligned_32) > - > - vpcmpeqb %ymm7, %ymmZ, %ymmM > - vpmovmskb %ymmM, %ecx > - bsf %ecx, %edx > - vmovdqu %ymm4, (%rdi) > - vmovdqu %ymm5, VEC_SIZE(%rdi) > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax > -# endif > - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) > - add $(VEC_SIZE - 1), %r8 > - sub %rdx, %r8 > - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - add $(VEC_SIZE * 3), %rsi > - add $(VEC_SIZE * 3), %rdi > - jmp L(CopyVecSizeExit) > +# ifdef USE_AS_STRCAT > + movq %rdi, %rax > +# include "strcat-strlen-avx2.h.S" > # endif > > -/* If source address alignment == destination address alignment */ > - > -L(SourceStringAlignmentLessTwoVecSize): > - vmovdqu (%rsi), %ymm3 > - vmovdqu VEC_SIZE(%rsi), %ymm2 > - vpcmpeqb %ymm3, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - cmp $VEC_SIZE, %r8 > -# else > - cmp $(VEC_SIZE + 1), %r8 > -# endif > - jbe L(CopyVecSizeTail1Case2OrCase3) > + movl %esi, %PAGE_ALIGN_REG > + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG > + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG > + ja L(page_cross) > +L(page_cross_continue): > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + movq %rdi, %rax > # endif > - test %edx, %edx > - jnz L(CopyVecSizeTail1) > - > - vmovdqu %ymm3, (%rdi) > - vpcmpeqb %ymm2, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - > -# ifdef USE_AS_STRNCPY > -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > - cmp $(VEC_SIZE * 2), %r8 > -# else > - cmp $((VEC_SIZE * 2) + 1), %r8 > -# endif > - jbe 
L(CopyTwoVecSize1Case2OrCase3) > -# endif > - test %edx, %edx > - jnz L(CopyTwoVecSize1) > - > - and $-VEC_SIZE, %rsi > - and $(VEC_SIZE - 1), %ecx > - jmp L(UnalignVecSizeBoth) > + VMOVU (%rsi), %VMM(0) > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > > -/*------End of main part with loops---------------------*/ > + testl %ecx, %ecx > + jz L(more_1x_vec) > > -/* Case1 */ > + /* No longer need ymm registers so just vzeroupper so it doesn't > + need to be duplicated at each return statement. */ > + COND_VZEROUPPER > > -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) > - .p2align 4 > -L(CopyVecSize): > - add %rcx, %rdi > -# endif > -L(CopyVecSizeTail): > - add %rcx, %rsi > -L(CopyVecSizeTail1): > - bsf %edx, %edx > -L(CopyVecSizeExit): > - cmp $32, %edx > - jae L(Exit32_63) > - cmp $16, %edx > - jae L(Exit16_31) > - cmp $8, %edx > - jae L(Exit8_15) > - cmp $4, %edx > - jae L(Exit4_7) > - cmp $3, %edx > - je L(Exit3) > - cmp $1, %edx > - ja L(Exit2) > - je L(Exit1) > - movb $0, (%rdi) > + xorl %edx, %edx > + bsfl %ecx, %edx > # ifdef USE_AS_STPCPY > - lea (%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $1, %r8 > - lea 1(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > -# endif > -L(return_vzeroupper): > - ZERO_UPPER_VEC_REGISTERS_RETURN > - > - .p2align 4 > -L(CopyTwoVecSize1): > - add $VEC_SIZE, %rsi > - add $VEC_SIZE, %rdi > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $VEC_SIZE, %r8 > -# endif > - jmp L(CopyVecSizeTail1) > - > - .p2align 4 > -L(CopyTwoVecSize): > - bsf %edx, %edx > - add %rcx, %rsi > - add $VEC_SIZE, %edx > - sub %ecx, %edx > - jmp L(CopyVecSizeExit) > - > - .p2align 4 > -L(CopyVecSizeUnaligned_0): > - bsf %edx, %edx > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > - vmovdqu %ymm4, (%rdi) > - add $((VEC_SIZE * 4) - 1), %r8 > - sub %rdx, %r8 > - lea 1(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > -# else > - jmp L(CopyVecSizeExit) > -# endif > - > - .p2align 4 > -L(CopyVecSizeUnaligned_16): > - bsf %ecx, %edx > - vmovdqu %ymm4, (%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea VEC_SIZE(%rdi, %rdx), %rax > -# endif > - vmovdqu %ymm5, VEC_SIZE(%rdi) > - add $((VEC_SIZE * 3) - 1), %r8 > - sub %rdx, %r8 > - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > + leaq (%rdi, %rdx), %rax > +# endif > + > + /* Use mask bits in rcx to detect which copy we need. If the low > + mask is zero then there must be a bit set in the upper half. > + I.e if ecx != 0 and cx == 0, then match must be upper 16 > + bits so we use L(copy_16_31). 
*/ > + testw %cx, %cx > + jz L(copy_16_31) > + > + testb %cl, %cl > + jz L(copy_8_15) > +# ifdef USE_AS_WCSCPY > + vmovd %xmm0, (%rdi) > + movl $0, (%END_REG) > + ret > # else > - add $VEC_SIZE, %rsi > - add $VEC_SIZE, %rdi > - jmp L(CopyVecSizeExit) > -# endif > - > - .p2align 4 > -L(CopyVecSizeUnaligned_32): > - bsf %edx, %edx > - vmovdqu %ymm4, (%rdi) > - vmovdqu %ymm5, VEC_SIZE(%rdi) > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > -# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax > -# endif > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) > - add $((VEC_SIZE * 2) - 1), %r8 > - sub %rdx, %r8 > - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi > - jmp L(StrncpyFillTailWithZero) > + testb $0x7, %cl > + jz L(copy_4_7) > + > + testl %edx, %edx > + jz L(set_null_term) > + vmovd %xmm0, %ecx > + movw %cx, (%rdi) > + > + .p2align 4,, 2 > +L(set_null_term): > + movb $0, (%END_REG) > + ret > + > + .p2align 4,, 12 > +L(copy_4_7): > + movl -3(%rsi, %rdx), %ecx > + vmovd %xmm0, (%rdi) > + movl %ecx, -3(%END_REG) > + ret > +# endif > + > + .p2align 4,, 10 > +L(copy_16_31): > + VMOVU -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1 > + VMOVU %xmm0, (%rdi) > + VMOVU %xmm1, -(16 - CHAR_SIZE)(%END_REG) > + ret > + > + .p2align 4,, 10 > +L(copy_8_15): > +# ifdef USE_AS_WCSCPY > + movl -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx > # else > - add $(VEC_SIZE * 2), %rsi > - add $(VEC_SIZE * 2), %rdi > - jmp L(CopyVecSizeExit) > -# endif > - > -# ifdef USE_AS_STRNCPY > -# ifndef USE_AS_STRCAT > - .p2align 4 > -L(CopyVecSizeUnalignedVec6): > - vmovdqu %ymm6, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec5): > - vmovdqu %ymm5, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec4): > - vmovdqu %ymm4, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec3): > - vmovdqu %ymm3, (%rdi, %rcx) > - jmp L(CopyVecSizeVecExit) > -# endif > - > -/* Case2 */ > - > - .p2align 4 > -L(CopyVecSizeCase2): > - add $VEC_SIZE, %r8 > - add %rcx, %rdi > - add %rcx, %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSizeCase2): > - add %rcx, %rsi > - bsf %edx, %edx > - add $VEC_SIZE, %edx > - sub %ecx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -L(CopyVecSizeTailCase2): > - add %rcx, %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -L(CopyVecSizeTail1Case2): > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > - jmp L(StrncpyExit) > - > -/* Case2 or Case3, Case3 */ > - > - .p2align 4 > -L(CopyVecSizeCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeCase2) > -L(CopyVecSizeCase3): > - add $VEC_SIZE, %r8 > - add %rcx, %rdi > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSizeCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyTwoVecSizeCase2) > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyVecSizeTailCase2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeTailCase2) > - add %rcx, %rsi > - jmp L(StrncpyExit) > - > - .p2align 4 > -L(CopyTwoVecSize1Case2OrCase3): > - add $VEC_SIZE, %rdi > - add $VEC_SIZE, %rsi > - sub $VEC_SIZE, %r8 > -L(CopyVecSizeTail1Case2OrCase3): > - test %rdx, %rdx > - jnz L(CopyVecSizeTail1Case2) > - jmp L(StrncpyExit) > -# endif > - > -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ > - > - .p2align 4 > -L(Exit1): > - movzwl (%rsi), %edx > 
- mov %dx, (%rdi) > -# ifdef USE_AS_STPCPY > - lea 1(%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $2, %r8 > - lea 2(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Exit2): > - movzwl (%rsi), %ecx > - mov %cx, (%rdi) > - movb $0, 2(%rdi) > -# ifdef USE_AS_STPCPY > - lea 2(%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $3, %r8 > - lea 3(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Exit3): > - mov (%rsi), %edx > - mov %edx, (%rdi) > + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx > +# endif > + vmovq %xmm0, (%rdi) > + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) > + ret > + > + > + .p2align 4,, 8 > +L(more_1x_vec): > +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT > + VMOVU %VMM(0), (%rdi) > +# endif > + subq %rsi, %rdi > + orq $(VEC_SIZE - 1), %rsi > + addq %rsi, %rdi > + VMOVA 1(%rsi), %VMM(1) > + > + /* Try and order stores after as many loads as is reasonable to > + avoid potential false dependencies. */ > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + VMOVU %VMM(0), (%rax) > +# endif > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > + > + VMOVA (VEC_SIZE + 1)(%rsi), %VMM(2) > + VMOVU %VMM(1), 1(%rdi) > + > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(3) > + VMOVU %VMM(2), (VEC_SIZE + 1)(%rdi) > + > + VPCMPEQ %VMM(3), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2 + 1)(%rdi) > + VPCMPEQ %VMM(4), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %edx > + testl %edx, %edx > + jnz L(ret_vec_x4) > + > + VMOVU %VMM(4), (VEC_SIZE * 3 + 1)(%rdi) > + > + /* Subtract rsi from rdi before aligning. Adding back rsi will > + get proper rdi (dst) for new src. */ > + subq %rsi, %rdi > + incq %rsi > + orq $(VEC_SIZE * 4 - 1), %rsi > + > + /* Do first half of loop ahead of time so loop can just start by > + storing. 
*/ > + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %edx > + addq %rsi, %rdi > + > + testl %edx, %edx > + jnz L(loop_4x_done) > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + > + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi) > + subq $(VEC_SIZE * -4), %rsi > + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 3 + 1)(%rdi) > + > + > + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + > + vpmovmskb %VMM(6), %edx > + subq $(VEC_SIZE * -4), %rdi > + testl %edx, %edx > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi) > + > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi) > + > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi) > +L(ret_vec_x4): > + bsfl %edx, %edx > + VMOVU ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) > + VMOVU %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > # ifdef USE_AS_STPCPY > - lea 3(%rdi), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub $4, %r8 > - lea 4(%rdi), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax > # endif > +L(return_end): > VZEROUPPER_RETURN > > - .p2align 4 > -L(Exit4_7): > - mov (%rsi), %ecx > - mov %ecx, (%rdi) > - mov -3(%rsi, %rdx), %ecx > - mov %ecx, -3(%rdi, %rdx) > + .p2align 4,, 8 > +L(ret_vec_x1): > + bsfl %ecx, %ecx > + VMOVU (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) > + VMOVU %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq 1(%rcx, %rdi), %rax > # endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Exit8_15): > - mov (%rsi), %rcx > - mov -7(%rsi, %rdx), %r9 > - mov %rcx, (%rdi) > - mov %r9, -7(%rdi, %rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > -# endif > - VZEROUPPER_RETURN > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > -L(Exit16_31): > - vmovdqu (%rsi), %xmm2 > - vmovdqu -15(%rsi, %rdx), %xmm3 > - vmovdqu %xmm2, (%rdi) > - vmovdqu %xmm3, -15(%rdi, %rdx) > + .p2align 4,, 8 > +L(ret_vec_x2): > + bsfl %ecx, %ecx > + VMOVU ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) > + VMOVU %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY 
&& !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax > # endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(Exit32_63): > - vmovdqu (%rsi), %ymm2 > - vmovdqu -31(%rsi, %rdx), %ymm3 > - vmovdqu %ymm2, (%rdi) > - vmovdqu %ymm3, -31(%rdi, %rdx) > + .p2align 4,, 8 > +L(ret_vec_x3): > + bsfl %ecx, %ecx > + VMOVU ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) > + VMOVU %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) > # ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT > - sub %rdx, %r8 > - sub $1, %r8 > - lea 1(%rdi, %rdx), %rdi > - jnz L(StrncpyFillTailWithZero) > + leaq (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax > # endif > VZEROUPPER_RETURN > > -# ifdef USE_AS_STRNCPY > > - .p2align 4 > -L(StrncpyExit1): > - movzbl (%rsi), %edx > - mov %dl, (%rdi) > + .p2align 4,, 4 > +L(page_cross): > + movq %rsi, %rcx > + andq $(VEC_SIZE * -1), %rcx > + > + VPCMPEQ (%rcx), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + shrxl %esi, %ecx, %ecx > +# if USE_MOVSB_IN_PAGE_CROSS > + /* Optimizing more aggressively for space as this is very cold > + code. This saves 2x cache lines. */ > + > + /* This adds once to the later result which will get correct > + copy bounds. NB: this can never zero-out a non-zero RCX as > + to be in the page cross case rsi cannot be aligned and we > + already right-shift rcx by the misalignment. */ > + shll $CHAR_SIZE, %ecx > + jz L(page_cross_continue) > + bsfl %ecx, %ecx > +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT > + movq %rdi, %rax > +# endif > + rep movsb > # ifdef USE_AS_STPCPY > - lea 1(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 1(%rdi) > + leaq -CHAR_SIZE(%rdi), %rax > # endif > - VZEROUPPER_RETURN > > - .p2align 4 > -L(StrncpyExit2): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > -# ifdef USE_AS_STPCPY > - lea 2(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 2(%rdi) > -# endif > VZEROUPPER_RETURN > > - .p2align 4 > -L(StrncpyExit3_4): > - movzwl (%rsi), %ecx > - movzwl -2(%rsi, %r8), %edx > - mov %cx, (%rdi) > - mov %dx, -2(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(StrncpyExit5_8): > - mov (%rsi), %ecx > - mov -4(%rsi, %r8), %edx > - mov %ecx, (%rdi) > - mov %edx, -4(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(StrncpyExit9_16): > - mov (%rsi), %rcx > - mov -8(%rsi, %r8), %rdx > - mov %rcx, (%rdi) > - mov %rdx, -8(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(StrncpyExit17_32): > - vmovdqu (%rsi), %xmm2 > - vmovdqu -16(%rsi, %r8), %xmm3 > - vmovdqu %xmm2, (%rdi) > - vmovdqu %xmm3, -16(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi, %r8) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(StrncpyExit33_64): > - /* 0/32, 31/16 */ > - vmovdqu (%rsi), %ymm2 > - vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 > - vmovdqu %ymm2, (%rdi) > - vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) > -# ifdef USE_AS_STPCPY > - lea (%rdi, %r8), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 
(%rdi, %r8) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(StrncpyExit65): > - /* 0/32, 32/32, 64/1 */ > - vmovdqu (%rsi), %ymm2 > - vmovdqu 32(%rsi), %ymm3 > - mov 64(%rsi), %cl > - vmovdqu %ymm2, (%rdi) > - vmovdqu %ymm3, 32(%rdi) > - mov %cl, 64(%rdi) > -# ifdef USE_AS_STPCPY > - lea 65(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, 65(%rdi) > -# endif > - VZEROUPPER_RETURN > +# else > + testl %ecx, %ecx > + jz L(page_cross_continue) > > + /* Traditional copy case, essentially same as used in non-page- > + cross case but since we can't reuse VMM(0) we need twice as > + many loads from rsi. */ > # ifndef USE_AS_STRCAT > - > - .p2align 4 > -L(Fill1): > - mov %dl, (%rdi) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Fill2): > - mov %dx, (%rdi) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Fill3_4): > - mov %dx, (%rdi) > - mov %dx, -2(%rdi, %r8) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Fill5_8): > - mov %edx, (%rdi) > - mov %edx, -4(%rdi, %r8) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Fill9_16): > - mov %rdx, (%rdi) > - mov %rdx, -8(%rdi, %r8) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(Fill17_32): > - vmovdqu %xmmZ, (%rdi) > - vmovdqu %xmmZ, -16(%rdi, %r8) > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(CopyVecSizeUnalignedVec2): > - vmovdqu %ymm2, (%rdi, %rcx) > - > - .p2align 4 > -L(CopyVecSizeVecExit): > - bsf %edx, %edx > - add $(VEC_SIZE - 1), %r8 > - add %rcx, %rdi > -# ifdef USE_AS_STPCPY > - lea (%rdi, %rdx), %rax > -# endif > - sub %rdx, %r8 > - lea 1(%rdi, %rdx), %rdi > - > - .p2align 4 > -L(StrncpyFillTailWithZero): > - xor %edx, %edx > - sub $VEC_SIZE, %r8 > - jbe L(StrncpyFillExit) > - > - vmovdqu %ymmZ, (%rdi) > - add $VEC_SIZE, %rdi > - > - mov %rdi, %rsi > - and $(VEC_SIZE - 1), %esi > - sub %rsi, %rdi > - add %rsi, %r8 > - sub $(VEC_SIZE * 4), %r8 > - jb L(StrncpyFillLessFourVecSize) > - > -L(StrncpyFillLoopVmovdqa): > - vmovdqa %ymmZ, (%rdi) > - vmovdqa %ymmZ, VEC_SIZE(%rdi) > - vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) > - vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) > - add $(VEC_SIZE * 4), %rdi > - sub $(VEC_SIZE * 4), %r8 > - jae L(StrncpyFillLoopVmovdqa) > - > -L(StrncpyFillLessFourVecSize): > - add $(VEC_SIZE * 2), %r8 > - jl L(StrncpyFillLessTwoVecSize) > - vmovdqa %ymmZ, (%rdi) > - vmovdqa %ymmZ, VEC_SIZE(%rdi) > - add $(VEC_SIZE * 2), %rdi > - sub $VEC_SIZE, %r8 > - jl L(StrncpyFillExit) > - vmovdqa %ymmZ, (%rdi) > - add $VEC_SIZE, %rdi > - jmp L(Fill) > - > - .p2align 4 > -L(StrncpyFillLessTwoVecSize): > - add $VEC_SIZE, %r8 > - jl L(StrncpyFillExit) > - vmovdqa %ymmZ, (%rdi) > - add $VEC_SIZE, %rdi > - jmp L(Fill) > - > - .p2align 4 > -L(StrncpyFillExit): > - add $VEC_SIZE, %r8 > -L(Fill): > - cmp $17, %r8d > - jae L(Fill17_32) > - cmp $9, %r8d > - jae L(Fill9_16) > - cmp $5, %r8d > - jae L(Fill5_8) > - cmp $3, %r8d > - jae L(Fill3_4) > - cmp $1, %r8d > - ja L(Fill2) > - je L(Fill1) > - VZEROUPPER_RETURN > - > -/* end of ifndef USE_AS_STRCAT */ > + xorl %edx, %edx > # endif > - > - .p2align 4 > -L(UnalignedLeaveCase2OrCase3): > - test %rdx, %rdx > - jnz L(UnalignedFourVecSizeLeaveCase2) > -L(UnalignedFourVecSizeLeaveCase3): > - lea (VEC_SIZE * 4)(%r8), %rcx > - and $-VEC_SIZE, %rcx > - add $(VEC_SIZE * 3), %r8 > - jl L(CopyVecSizeCase3) > - vmovdqu %ymm4, (%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - vmovdqu %ymm5, VEC_SIZE(%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) > - sub $VEC_SIZE, %r8 > - jb L(CopyVecSizeCase3) > - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) > + bsfl %ecx, %edx > 
# ifdef USE_AS_STPCPY > - lea (VEC_SIZE * 4)(%rdi), %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (VEC_SIZE * 4)(%rdi) > + leaq (%rdi, %rdx), %rax > +# elif !defined USE_AS_STRCAT > + movq %rdi, %rax > # endif > - VZEROUPPER_RETURN > > - .p2align 4 > -L(UnalignedFourVecSizeLeaveCase2): > - xor %ecx, %ecx > - vpcmpeqb %ymm4, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - add $(VEC_SIZE * 3), %r8 > - jle L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec4) > -# else > - jnz L(CopyVecSize) > -# endif > - vpcmpeqb %ymm5, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - vmovdqu %ymm4, (%rdi) > - add $VEC_SIZE, %rcx > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec5) > -# else > - jnz L(CopyVecSize) > -# endif > + /* vzeroupper early to avoid duplicating at each return. */ > + COND_VZEROUPPER > > - vpcmpeqb %ymm6, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - vmovdqu %ymm5, VEC_SIZE(%rdi) > - add $VEC_SIZE, %rcx > - sub $VEC_SIZE, %r8 > - jbe L(CopyVecSizeCase2OrCase3) > - test %edx, %edx > -# ifndef USE_AS_STRCAT > - jnz L(CopyVecSizeUnalignedVec6) > -# else > - jnz L(CopyVecSize) > -# endif > + testw %cx, %cx > + jz L(page_cross_copy_16_31) > > - vpcmpeqb %ymm7, %ymmZ, %ymmM > - vpmovmskb %ymmM, %edx > - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) > - lea VEC_SIZE(%rdi, %rcx), %rdi > - lea VEC_SIZE(%rsi, %rcx), %rsi > - bsf %edx, %edx > - cmp %r8d, %edx > - jb L(CopyVecSizeExit) > -L(StrncpyExit): > - cmp $65, %r8d > - je L(StrncpyExit65) > - cmp $33, %r8d > - jae L(StrncpyExit33_64) > - cmp $17, %r8d > - jae L(StrncpyExit17_32) > - cmp $9, %r8d > - jae L(StrncpyExit9_16) > - cmp $5, %r8d > - jae L(StrncpyExit5_8) > - cmp $3, %r8d > - jae L(StrncpyExit3_4) > - cmp $1, %r8d > - ja L(StrncpyExit2) > - je L(StrncpyExit1) > -# ifdef USE_AS_STPCPY > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRCAT > - movb $0, (%rdi) > -# endif > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(ExitZero): > -# ifndef USE_AS_STRCAT > - mov %rdi, %rax > -# endif > - VZEROUPPER_RETURN > + testb %cl, %cl > + jz L(page_cross_copy_8_15) > > -# endif > + testl $0x7, %cl > + jz L(page_cross_copy_4_7) > > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# else > -END (STRCAT) > -# endif > + testl %edx, %edx > + jz L(page_cross_set_null_term) > + movzwl (%rsi), %ecx > + movw %cx, (%rdi) > +L(page_cross_set_null_term): > + movb $0, (%END_REG) > + ret > + > + .p2align 4,, 4 > +L(page_cross_copy_4_7): > + movl (%rsi), %ecx > + movl -3(%rsi, %rdx), %esi > + movl %ecx, (%rdi) > + movl %esi, -3(%END_REG) > + ret > + > + .p2align 4,, 4 > +L(page_cross_copy_8_15): > + movq (%rsi), %rcx > + movq -7(%rsi, %rdx), %rsi > + movq %rcx, (%rdi) > + movq %rsi, -7(%END_REG) > + ret > + > + > + .p2align 4,, 3 > +L(page_cross_copy_16_31): > + VMOVU (%rsi), %xmm0 > + VMOVU -15(%rsi, %rdx), %xmm1 > + VMOVU %xmm0, (%rdi) > + VMOVU %xmm1, -15(%END_REG) > + ret > +# endif > + > +END(STRCPY) > #endif > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S > index 0dcea18dbb..7272deef2c 100644 > --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S > @@ -1,3 +1,3 @@ > -#define USE_AS_STRNCAT > -#define STRCAT __strncat_avx2_rtm > -#include "strcat-avx2-rtm.S" > +#define STRNCAT __strncat_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > +#include "strncat-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S 
b/sysdeps/x86_64/multiarch/strncat-avx2.S > index 52ecbca943..ffa58bd0de 100644 > --- a/sysdeps/x86_64/multiarch/strncat-avx2.S > +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S > @@ -1,7 +1,419 @@ > -#ifndef STRNCAT > -# define STRNCAT __strncat_avx2 > -#endif > +/* strncat with AVX2 > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRNCAT > +# define STRNCAT __strncat_avx2 > +# endif > + > +# ifdef USE_AS_WCSCPY > +# define MOVCHAR movl > +# define VPCMPEQ vpcmpeqd > +# define VPMIN vpminud > +# define CHAR_SIZE 4 > +# else > +# define MOVCHAR movb > +# define VPCMPEQ vpcmpeqb > +# define VPMIN vpminub > +# define CHAR_SIZE 1 > +# endif > + > +# include "strncpy-or-cat-overflow-def.h" > + > +# define PAGE_SIZE 4096 > + > +# define VZERO VMM(7) > +# define VZERO_128 VMM_128(7) > + > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRNCAT) > + /* Filter zero length strings and very long strings. Zero > + length strings just return, very long strings are handled by > + using the non-length variant {wcs|str}cat. */ > + movq %rdi, %rax > +# ifdef USE_AS_WCSCPY > + leaq -1(%rdx), %rcx > + shr $56, %rcx > + jnz L(zero_len) > + salq $2, %rdx > +# else > + test %rdx, %rdx > + jl L(zero_len) > +# endif > + vpxor %VZERO_128, %VZERO_128, %VZERO_128 > + > +# include "strcat-strlen-avx2.h.S" > + > + movl %esi, %ecx > + andl $(PAGE_SIZE - 1), %ecx > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx > + ja L(page_cross) > +L(page_cross_continue): > + VMOVU (%rsi), %VMM(0) > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + tzcnt %ecx, %r8d > + cmpq %r8, %rdx > + jbe L(less_1x_vec) > + > + testl %ecx, %ecx > + jz L(more_1x_vec) > + > + /* Hoist this to save code size. */ > + > + movl %r8d, %edx > + > +L(less_1x_vec): > + COND_VZEROUPPER > + > + cmpl $16, %edx > + jae L(copy_16_31) > + cmpl $8, %edx > + jae L(copy_8_15) > + > + > +# ifdef USE_AS_WCSCPY > + vmovd %VMM_128(0), (%rdi) > + MOVCHAR $0, (%rdi, %rdx) > + ret > +# else > + cmpl $4, %edx > + jae L(copy_4_7) > + > + movzbl (%rsi), %ecx > + cmpl $1, %edx > + jbe L(set_null_term) > + > + /* NB: make this `vmovw` if support for AVX512-FP16 is added. 
> + */ > + movzwl 1(%rsi), %esi > + movw %si, 1(%rdi) > + > + .p2align 4,, 1 > +L(set_null_term): > + movb %cl, (%rdi) > + MOVCHAR $0, (%rdi, %rdx) > + ret > + > + .p2align 4,, 11 > +L(copy_4_7): > + movl -(4)(%rsi, %rdx), %ecx > + vmovd %xmm0, (%rdi) > + movl %ecx, -(4)(%rdi, %rdx) > + MOVCHAR $0, (%rdi, %rdx) > + ret > +# endif > + > + > + .p2align 4,, 10 > +L(copy_16_31): > + VMOVU -(16)(%rsi, %rdx), %xmm1 > + VMOVU %xmm0, (%rdi) > + VMOVU %xmm1, -(16)(%rdi, %rdx) > + MOVCHAR $0, (%rdi, %rdx) > + ret > + > + .p2align 4,, 10 > +L(copy_8_15): > + movq -(8)(%rsi, %rdx), %rcx > + vmovq %xmm0, (%rdi) > + movq %rcx, -(8)(%rdi, %rdx) > + MOVCHAR $0, (%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > + .p2align 6,, 14 > +L(more_1x_vec): > + VMOVU %VMM(0), (%rdi) > + > + /* Align rsi (src) and just rdx/rdi (length/dst). */ > + addq %rsi, %rdx > + subq %rsi, %rdi > + orq $(VEC_SIZE - 1), %rsi > + incq %rsi > + addq %rsi, %rdi > +L(loop_last_4x_vec): > + subq %rsi, %rdx > + VMOVA 0(%rsi), %VMM(1) > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > +L(last_2x_vec): > + tzcnt %ecx, %ecx > + cmpl %ecx, %edx > + jbe L(ret_vec_x1_len) > + > + cmpl $VEC_SIZE, %ecx > + jnz L(ret_vec_x1) > + > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2) > + VMOVU %VMM(1), (%rdi) > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + addl $-VEC_SIZE, %edx > + bzhil %edx, %ecx, %r8d > + jz L(ret_vec_x2_len) > +L(ret_vec_x2): > + bsfl %ecx, %edx > +L(ret_vec_x2_len): > + VMOVU (%rsi, %rdx), %VMM(0) > + MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx) > + VMOVU %VMM(0), (%rdi, %rdx) > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > + > + > + .p2align 4,, 12 > +L(ret_vec_x1_len): > + movl %edx, %ecx > +L(ret_vec_x1): > + VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1) > + MOVCHAR $0, (%rdi, %rcx) > + VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx) > + VZEROUPPER_RETURN > + > + .p2align 4,, 8 > +L(last_4x_vec): > + subq $-(VEC_SIZE * 4), %rsi > + VMOVA 0(%rsi), %VMM(1) > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + subq $-(VEC_SIZE * 4), %rdi > + addl $-(VEC_SIZE * 4), %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > + .p2align 4,, 8 > +L(more_2x_vec): > + /* L(ret_vec_x1) expects ecx to have position of first match so > + test with bsf. */ > + bsfl %ecx, %ecx > + jnz L(ret_vec_x1) > + > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2) > + VMOVU %VMM(1), (%rdi) > + > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > > -#define USE_AS_STRNCAT > -#define STRCAT STRNCAT > -#include "strcat-avx2.S" > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3) > + VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi) > + > + VPCMPEQ %VMM(3), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + /* Check if length is greater than 4x VEC. 
*/ > + cmpq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + addl $(VEC_SIZE * -2), %edx > + > + tzcnt %ecx, %ecx > + cmpl %ecx, %edx > + jbe L(ret_vec_x3_len) > + > + cmpl $VEC_SIZE, %ecx > + jnz L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi) > + VPCMPEQ %VMM(4), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + addl $-VEC_SIZE, %edx > + bzhil %edx, %ecx, %r8d > + jz L(ret_vec_x4_len) > +L(ret_vec_x4): > + bsfl %ecx, %edx > +L(ret_vec_x4_len): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0) > + MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx) > + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx) > + VZEROUPPER_RETURN > + > + .p2align 4,, 4 > +L(ret_vec_x3_len): > + movl %edx, %ecx > +L(ret_vec_x3): > + VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0) > + MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx) > + VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx) > + VZEROUPPER_RETURN > + > + > + .p2align 4,, 8 > +L(more_4x_vec): > + bsfl %ecx, %ecx > + jnz L(ret_vec_x3) > + > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) > + VPCMPEQ %VMM(4), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x4) > + > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi) > + > + > + /* Recheck length before aligning. */ > + cmpq $(VEC_SIZE * 8), %rdx > + jbe L(last_4x_vec) > + > + /* Align rsi (src) and just rdx/rdi (length/dst). */ > + addq %rsi, %rdx > + subq %rsi, %rdi > + subq $-(VEC_SIZE * 4), %rsi > + andq $(VEC_SIZE * -4), %rsi > + > + /* Do first half of loop ahead of time so loop can just start by > + storing. */ > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %r8d > + addq %rsi, %rdi > + testl %r8d, %r8d > + jnz L(loop_4x_done) > + > + /* Use r9 for end of region before handling last 4x VEC > + specially. */ > + leaq -(VEC_SIZE * 4)(%rdx), %r9 > + > + .p2align 4,, 11 > +L(loop_4x_vec): > + > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > + subq $(VEC_SIZE * -4), %rsi > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) > + > + subq $(VEC_SIZE * -4), %rdi > + cmpq %rsi, %r9 > + jbe L(loop_last_4x_vec) > + > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + > + vpmovmskb %VMM(6), %r8d > + > + testl %r8d, %r8d > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + /* L(ret_vec_x1) expects ecx to have position of first match so > + test with bsf. 
*/ > + bsfl %ecx, %ecx > + jnz L(ret_vec_x1) > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > + > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > + > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + bsfl %ecx, %ecx > + jnz L(ret_vec_x3) > + > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > + bsfl %r8d, %r8d > + VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8) > + VZEROUPPER_RETURN > + > + > + > + .p2align 4,, 4 > +L(page_cross): > + movq %rsi, %r8 > + andq $(VEC_SIZE * -1), %r8 > + > + VPCMPEQ (%r8), %VZERO, %VMM(6) > + > + vpmovmskb %VMM(6), %ecx > + shrxl %esi, %ecx, %ecx > + > + subl %esi, %r8d > + andl $(VEC_SIZE - 1), %r8d > + cmpq %r8, %rdx > + jb L(page_cross_small) > + > + /* Optimizing more aggressively for space as this is very cold > + code. This saves 2x cache lines. */ > + > + /* This adds once to the later result which will get correct > + copy bounds. NB: this can never zero-out a non-zero RCX as > + to be in the page cross case rsi cannot be aligned and we > + already right-shift rcx by the misalignment. */ > + shll $CHAR_SIZE, %ecx > + jz L(page_cross_continue) > + bsfl %ecx, %ecx > + rep movsb > + VZEROUPPER_RETURN > + > +L(page_cross_small): > + tzcntl %ecx, %ecx > + jz L(page_cross_setz) > + cmpl %edx, %ecx > + cmova %edx, %ecx > + rep movsb > +L(page_cross_setz): > + MOVCHAR $0, (%rdi) > + VZEROUPPER_RETURN > +L(zero_len): > +# ifdef USE_AS_WCSCPY > + test %rdx, %rdx > +# endif > + jnz OVERFLOW_STRCAT > + ret > + > + > +END(STRNCAT) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S > index 79e7083299..d42ad88b3d 100644 > --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S > @@ -1,3 +1,3 @@ > -#define USE_AS_STRNCPY > -#define STRCPY __strncpy_avx2_rtm > -#include "strcpy-avx2-rtm.S" > +#define STRNCPY __strncpy_avx2_rtm > +#include "x86-avx-rtm-vecs.h" > +#include "strncpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S > index ce634e94fa..e9afd8fbed 100644 > --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S > +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S > @@ -1,7 +1,735 @@ > -#ifndef STRNCPY > -# define STRNCPY __strncpy_avx2 > -#endif > +/* strncpy with AVX2 > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRNCPY > +# define STRNCPY __strncpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSCPY > +# define VPCMPEQ vpcmpeqd > +# define VPMIN vpminud > +# define CHAR_SIZE 4 > +# else > +# define VPCMPEQ vpcmpeqb > +# define VPMIN vpminub > +# define CHAR_SIZE 1 > +# endif > + > +# include "strncpy-or-cat-overflow-def.h" > + > +# define PAGE_SIZE 4096 > + > +# define VZERO VMM(7) > +# define VZERO_128 VMM_128(7) > + > + > + .section SECTION(.text), "ax", @progbits > +ENTRY(STRNCPY) > + /* Filter zero length strings and very long strings. Zero > + length strings just return, very long strings are handled by > + just running rep stos{b|l} to zero set (which will almost > + certainly segfault), if that succeeds then just calling > + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ > +# ifdef USE_AS_WCSCPY > + decq %rdx > + movq %rdx, %rax > + /* 56 is end of max supported address space. */ > + shr $56, %rax > + jnz L(zero_len) > + salq $2, %rdx > +# else > + decq %rdx > + /* `dec` can macrofuse with `jl`. If the flag needs to become > + `jb` replace `dec` with `sub`. */ > + jl L(zero_len) > +# endif > + > + vpxor %VZERO_128, %VZERO_128, %VZERO_128 > + movl %esi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + VMOVU (%rsi), %VMM(0) > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + /* If no STPCPY just save end ahead of time. */ > +# ifndef USE_AS_STPCPY > + movq %rdi, %rax > +# elif defined USE_AS_WCSCPY > + /* Clear dependency as nearly all return code for wcpncpy uses > + `setc %al`. */ > + xorl %eax, %eax > +# endif > + > + cmpq $(VEC_SIZE - CHAR_SIZE), %rdx > + /* `jb` because length rdx is now length - CHAR_SIZE. */ > + jbe L(less_1x_vec) > + > + /* This may overset but thats fine because we still need to zero > + fill. */ > + VMOVU %VMM(0), (%rdi) > + > + testl %ecx, %ecx > + jnz L(zfill) > + > + /* Align. */ > + addq %rsi, %rdx > + subq %rsi, %rdi > + orq $(VEC_SIZE - 1), %rsi > + incq %rsi > +L(last_4x_vec): > + addq %rsi, %rdi > +L(loop_last_4x_vec): > + subq %rsi, %rdx > + > + > + VMOVA 0(%rsi), %VMM(1) > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + cmpq $(VEC_SIZE * 2), %rdx > + jae L(more_2x_vec) > + > + cmpl $(VEC_SIZE), %edx > + jb L(ret_vec_x1_len) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > + > + VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6) > + VMOVU %VMM(1), (%rdi) > + vpmovmskb %VMM(6), %ecx > + shlq $VEC_SIZE, %rcx > +L(ret_vec_x1_len): > + tzcntq %rcx, %rcx > + cmpl %ecx, %edx > + jbe L(ret_vec_x1_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > +L(ret_vec_x1_len_no_zfill_mov): > + movl %ecx, %edx > +# ifdef USE_AS_STPCPY > + /* clear flags. 
*/ > + xorl %ecx, %ecx > +# endif > +L(ret_vec_x1_len_no_zfill): > + VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) > + VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + setc %al > + addq %rdx, %rdi > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# endif > +L(return_vzeroupper): > + ZERO_UPPER_VEC_REGISTERS_RETURN > + > + .p2align 4,, 6 > +L(ret_vec_x1): > + bsfl %ecx, %ecx > + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > + subl %ecx, %edx > + /* Check if we need to reload/store. */ > + cmpl $VEC_SIZE, %edx > + jb L(ret_vec_x1_len_no_zfill_mov) > + /* Otherwise safe to just store directly. */ > + VMOVU %VMM(1), (%rdi) > + VMOVU %VZERO, (%rdi, %rcx) > +# ifdef USE_AS_STPCPY > + leaq (%rdi, %rcx), %rax > +# endif > + VZEROUPPER_RETURN > + > + .p2align 4,, 12 > +L(more_2x_vec): > + VMOVU %VMM(1), (%rdi) > + testl %ecx, %ecx > + /* Must fill at least 2x VEC. */ > + jnz L(zfill_vec1) > + > + VMOVA VEC_SIZE(%rsi), %VMM(2) > + VMOVU %VMM(2), VEC_SIZE(%rdi) > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + /* Must fill at least 1x VEC. */ > + jnz L(zfill_vec2) > + > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3) > + VPCMPEQ %VMM(3), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + > + /* Check if len is more 4x VEC. -CHAR_SIZE because rdx is len - > + CHAR_SIZE. */ > + cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx > + ja L(more_4x_vec) > + > + subl $(VEC_SIZE * 3), %edx > + jb L(ret_vec_x3_len) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) > + vpmovmskb %VMM(6), %ecx > + tzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jbe L(ret_vec_x4_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > + movl %ecx, %edx > +L(ret_vec_x4_len_no_zfill): > + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) > + VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + setc %al > + addq %rdx, %rdi > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > +# else > + leal (VEC_SIZE * 3 + 0)(%edx), %eax > + adcq %rdi, %rax > +# endif > +# endif > + VZEROUPPER_RETURN > + > + > +L(ret_vec_x3_len): > + addl $(VEC_SIZE * 1), %edx > + tzcntl %ecx, %ecx > + cmpl %ecx, %edx > + jbe L(ret_vec_x3_len_no_zfill) > + /* Fall through (expectation) is copy len < buffer len. */ > + VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > +L(ret_vec_x3_len_no_zfill_mov): > + movl %ecx, %edx > +# ifdef USE_AS_STPCPY > + /* clear flags. 
*/ > + xorl %ecx, %ecx > +# endif > + .p2align 4,, 4 > +L(ret_vec_x3_len_no_zfill): > + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) > + VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + setc %al > + addq %rdx, %rdi > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > +# else > + leal (VEC_SIZE * 2 + 0)(%rdx), %eax > + adcq %rdi, %rax > +# endif > +# endif > + VZEROUPPER_RETURN > + > + > + .p2align 4,, 8 > +L(ret_vec_x3): > + bsfl %ecx, %ecx > + VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx) > + subl %ecx, %edx > + jl L(ret_vec_x3_len_no_zfill_mov) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) > + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx) > +# ifdef USE_AS_STPCPY > + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax > +# endif > + VZEROUPPER_RETURN > + > + .p2align 4,, 8 > +L(more_4x_vec): > + > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) > + testl %ecx, %ecx > + jnz L(zfill_vec3) > + > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi) > + VPCMPEQ %VMM(4), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(zfill_vec4) > + > + movq %rdx, %rcx > + addq %rsi, %rdx > + subq %rsi, %rdi > + subq $-(VEC_SIZE * 4), %rsi > + /* Recheck length before aligning. */ > + cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx > + jbe L(last_4x_vec) > + > + andq $(VEC_SIZE * -4), %rsi > + > + /* Do first half of loop ahead of time so loop can just start by > + storing. */ > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %r8d > + addq %rsi, %rdi > + testl %r8d, %r8d > + jnz L(loop_4x_done) > + > + /* Use r9 as end register. */ > + leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9 > > -#define USE_AS_STRNCPY > -#define STRCPY STRNCPY > -#include "strcpy-avx2.S" > + .p2align 4,, 11 > +L(loop_4x_vec): > + > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > + subq $(VEC_SIZE * -4), %rsi > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) > + > + subq $(VEC_SIZE * -4), %rdi > + cmpq %rsi, %r9 > + jbe L(loop_last_4x_vec) > + > + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) > + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) > + > + VPMIN %VMM(0), %VMM(1), %VMM(4) > + VPMIN %VMM(2), %VMM(3), %VMM(6) > + VPMIN %VMM(4), %VMM(6), %VMM(6) > + VPCMPEQ %VMM(6), %VZERO, %VMM(6) > + > + vpmovmskb %VMM(6), %r8d > + > + testl %r8d, %r8d > + jz L(loop_4x_vec) > + > +L(loop_4x_done): > + subq %rsi, %rdx > + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) > + VPCMPEQ %VMM(0), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(zfill_vec1) > + > + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) > + VPCMPEQ %VMM(1), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(zfill_vec2) > + > + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) > + VPCMPEQ %VMM(2), %VZERO, %VMM(6) > + vpmovmskb %VMM(6), %ecx > + testl %ecx, %ecx > + jnz L(zfill_vec3) > + > + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) > + movl %r8d, %ecx > + > + // Zfill more.... 
> + > + .p2align 4,, 4 > +L(zfill_vec4): > + addq $(VEC_SIZE * 2), %rdi > + subq $(VEC_SIZE * 2), %rdx > +L(zfill_vec2): > + shlq $VEC_SIZE, %rcx > +L(zfill): > + bsfq %rcx, %rcx > + subq %rcx, %rdx > + addq %rcx, %rdi > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_from_page_cross): > + cmpq $VEC_SIZE, %rdx > + jb L(zfill_less_vec_vzeroupper) > + > +L(zfill_more_1x_vec): > + VMOVU %VZERO, CHAR_SIZE(%rdi) > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) > + cmpq $(VEC_SIZE * 2), %rdx > + jae L(zfill_more_2x_vec) > +L(zfill_done0): > + VZEROUPPER_RETURN > + > + .p2align 4,, 8 > +L(zfill_vec3): > + addq $(VEC_SIZE * 2), %rdi > + subq $(VEC_SIZE * 2), %rdx > + .p2align 4,, 2 > +L(zfill_vec1): > + bsfl %ecx, %ecx > + addq %rcx, %rdi > + subq %rcx, %rdx > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + /* zfill from vec1/vec3 must have to set at least 2x VECS. */ > + > + VMOVU %VZERO, CHAR_SIZE(%rdi) > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) > + cmpq $(VEC_SIZE * 2), %rdx > + jb L(zfill_done0) > +L(zfill_more_2x_vec): > + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx) > + VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi) > + subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx > + jbe L(zfill_done) > + > + addq %rdi, %rdx > + VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi) > + VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi) > + > + > + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) > + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) > + > + subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi > + cmpq %rdi, %rdx > + jbe L(zfill_done) > + > + andq $-(VEC_SIZE), %rdi > + .p2align 4,, 12 > +L(zfill_loop_4x_vec): > + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) > + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) > + subq $-(VEC_SIZE * 4), %rdi > + cmpq %rdi, %rdx > + ja L(zfill_loop_4x_vec) > +L(zfill_done): > + VZEROUPPER_RETURN > + > + > + .p2align 4,, 8 > +L(copy_1x): > + VMOVU %VMM(0), (%rdi) > + testl %ecx, %ecx > + jz L(ret_32_32) > +L(zfill_less_vec): > + bsfl %ecx, %ecx > +L(zfill_less_vec_no_bsf): > + subq %rcx, %rdx > + addq %rcx, %rdi > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_less_vec_vzeroupper): > + COND_VZEROUPPER > + /* We are taking advantage of the fact that to be here we must > + be writing null-term as (%rdi, %rcx) we have a byte of lee- > + way for overwriting. */ > + cmpl $16, %edx > + jb L(zfill_less_16) > + VMOVU %VZERO_128, (%rdi) > + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx) > + ret > +# ifdef USE_AS_STPCPY > +L(ret_32_32): > + leaq CHAR_SIZE(%rdi, %rdx), %rax > + VZEROUPPER_RETURN > +# endif > + > + .p2align 4,, 4 > +L(copy_16_31): > + /* Overfill to avoid branches. */ > + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1 > + vmovdqu %xmm0, (%rdi) > + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx) > + cmpl %ecx, %edx > + ja L(zfill_less_vec_no_bsf) > +# ifndef USE_AS_STPCPY > +L(ret_32_32): > +# else > +# ifdef USE_AS_WCSCPY > + setc %al > + addq %rdx, %rdi > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# endif > + VZEROUPPER_RETURN > + > + .p2align 4,, 4 > +L(copy_8_15): > + /* Overfill to avoid branches. 
*/ > + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi > + vmovq %xmm0, (%rdi) > + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx) > + cmpl %ecx, %edx > + jbe L(ret_8_15) > + subq %rcx, %rdx > + addq %rcx, %rdi > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + .p2align 4,, 8 > +L(zfill_less_16): > + xorl %ecx, %ecx > + cmpl $8, %edx > + jb L(zfill_less_8) > + movq %rcx, (%rdi) > + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx) > +# ifndef USE_AS_STPCPY > +L(ret_8_15): > +# endif > + ret > + > + > + .p2align 4,, 8 > +L(less_1x_vec): > + /* Reuse flag from `cmp $VEC_SIZE, %rdx`. The idea is many > + buffer sizes are aligned conventially. */ > + je L(copy_1x) > + > + tzcntl %ecx, %ecx > + cmpl $16, %edx > + jae L(copy_16_31) > + > + COND_VZEROUPPER > + cmpl $8, %edx > + jae L(copy_8_15) > +# ifdef USE_AS_WCSCPY > + testl %ecx, %ecx > + jz L(zfill_less_8_set_ret) > + > + movl (%rsi, %rdx), %esi > + vmovd %xmm0, (%rdi) > + movl %esi, (%rdi, %rdx) > + > +# ifdef USE_AS_STPCPY > + cmpl %ecx, %edx > +L(ret_8_15): > + setc %al > + addq %rdx, %rdi > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# endif > + ret > +L(zfill_less_8_set_ret): > + xorl %ecx, %ecx > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > +L(zfill_less_8): > + movl %ecx, (%rdi) > + movl %ecx, (%rdi, %rdx) > + ret > + > +# else > + cmpl $3, %edx > + jb L(copy_0_3) > + /* Overfill to avoid branches. */ > + movl -3(%rsi, %rdx), %esi > + vmovd %xmm0, (%rdi) > + movl %esi, -3(%rdi, %rdx) > + cmpl %ecx, %edx > + jbe L(ret_4_7) > + subq %rcx, %rdx > + addq %rcx, %rdi > +# ifdef USE_AS_STPCPY > + movq %rdi, %rax > +# endif > + xorl %ecx, %ecx > + .p2align 4,, 8 > +L(zfill_less_8): > + cmpl $3, %edx > + jb L(zfill_less_3) > + movl %ecx, (%rdi) > + movl %ecx, -3(%rdi, %rdx) > +# ifdef USE_AS_STPCPY > + ret > +# endif > + > +L(ret_4_7): > +# ifdef USE_AS_STPCPY > +L(ret_8_15): > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > + ret > + > + .p2align 4,, 4 > +L(zfill_less_3): > + testl %edx, %edx > + jz L(zfill_1) > + movw %cx, (%rdi) > +L(zfill_1): > + movb %cl, (%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > +L(copy_0_3): > + vmovd %xmm0, %r8d > + testl %edx, %edx > + jz L(copy_1) > + movw %r8w, (%rdi) > + cmpl %ecx, %edx > + ja L(zfill_from_1) > + movzbl (%rsi, %rdx), %r8d > +# ifdef USE_AS_STPCPY > + movl %edx, %eax > + adcq %rdi, %rax > + movb %r8b, (%rdi, %rdx) > + ret > +# endif > + > +L(copy_1): > +# ifdef USE_AS_STPCPY > + movl %edx, %eax > + cmpl %ecx, %edx > + adcq %rdi, %rax > +# endif > +# ifdef USE_AS_WCSCPY > + vmovd %xmm0, (%rdi) > +# else > + movb %r8b, (%rdi, %rdx) > +# endif > + ret > +# endif > + > + .p2align 4,, 2 > +L(zero_len): > + movq %rdi, %rax > + ret > +# ifndef USE_AS_WCSCPY > + .p2align 4,, 8 > +L(zfill_from_1): > +# ifdef USE_AS_STPCPY > + leaq (%rdi, %rcx), %rax > +# endif > + movw $0, -1(%rdi, %rdx) > + ret > +# endif > + > + .p2align 4,, 4 > + .p2align 6,, 8 > +L(page_cross): > + movq %rsi, %rax > + andq $(VEC_SIZE * -1), %rax > + > + VPCMPEQ (%rax), %VZERO, %VMM(6) > + > + vpmovmskb %VMM(6), %ecx > + shrxl %esi, %ecx, %ecx > + > + subl %esi, %eax > + andl $(VEC_SIZE - 1), %eax > + cmpq %rax, %rdx > + jb L(page_cross_small) > + /* Optimizing more aggressively for space as this is very cold > + code. This saves 2x cache lines. */ > + > + /* If rcx is non-zero then continue. 
*/ > + shl $CHAR_SIZE, %ecx > + jz L(page_cross_continue) > + bsf %ecx, %ecx > + > + subq %rcx, %rdx > +# ifdef USE_AS_STPCPY > + leaq -CHAR_SIZE(%rdi, %rcx), %rax > +# else > + movq %rdi, %rax > +# endif > + > + rep movsb > +# ifdef USE_AS_WCSCPY > + movl $0, (%rdi) > +# else > + movb $0, (%rdi) > +# endif > + jmp L(zfill_from_page_cross) > + > +L(page_cross_small): > + tzcntl %ecx, %ecx > + xorl %eax, %eax > + cmpl %ecx, %edx > + jbe L(page_cross_copy_only) > + > + /* Do a zfill of the tail before copying. */ > + movq %rdi, %r9 > + movl %ecx, %r8d > + > + subl %ecx, %edx > + leaq CHAR_SIZE(%rdi, %rcx), %rdi > + movl %edx, %ecx > + rep stosb > + movq %r9, %rdi > + movl %r8d, %edx > +L(page_cross_copy_only): > + leal CHAR_SIZE(%rdx), %ecx > +# ifdef USE_AS_STPCPY > +# ifdef USE_AS_WCSCPY > + setc %al > + addq %rdi, %rdx > + leaq (%rdx, %rax, CHAR_SIZE), %rax > +# else > + movl %edx, %eax > + adcq %rdi, %rax > +# endif > +# else > + movq %rdi, %rax > +# endif > + rep movsb > + ret > + > + > +L(best_effort_strncpy): > + movq %rdx, %rcx > + xorl %eax, %eax > + movq %rdi, %r8 > + /* The length is >= 2^63. We very much so expect to segfault at > + rep stos. If that doesn't happen then just strcpy to finish. > + */ > +# ifdef USE_AS_WCSCPY > + rep stosl > +# else > + rep stosb > +# endif > + movq %r8, %rdi > + jmp OVERFLOW_STRCPY > +END(STRNCPY) > +#endif > diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h > index dca1089060..275af7560a 100644 > --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h > +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h > @@ -27,7 +27,8 @@ > #define VEC_SIZE 32 > #include "x86-vec-macros.h" > > -#define USE_WITH_AVX 1 > +#define USE_WITH_AVX2 1 > + > #define SECTION(p) p##.avx > > /* 4-byte mov instructions with AVX2. */ > -- > 2.34.1 > LGTM. Thanks. H.J.
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S index 2b9c07a59f..90e532dbe8 100644 --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -1,3 +1,3 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_avx2_rtm -#include "strcpy-avx2-rtm.S" +#define STPCPY __stpcpy_avx2_rtm +#include "x86-avx-rtm-vecs.h" +#include "stpcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S index 60a2ccfe53..46ee07be36 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -1,4 +1,3 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_avx2_rtm -#include "strcpy-avx2-rtm.S" +#define STPNCPY __stpncpy_avx2_rtm +#include "x86-avx-rtm-vecs.h" +#include "stpncpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S index b2f8c19143..a46a8edbe2 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S @@ -3,6 +3,5 @@ #endif #define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY STPNCPY -#include "strcpy-avx2.S" +#define STRNCPY STPNCPY +#include "strncpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S index 637fb557c4..e84f4f1fef 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -1,12 +1,3 @@ -#ifndef STRCAT -# define STRCAT __strcat_avx2_rtm -#endif - -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST - -#define VZEROUPPER_RETURN jmp L(return_vzeroupper) - -#define SECTION(p) p##.avx.rtm - +#define STRCAT __strcat_avx2_rtm +#include "x86-avx-rtm-vecs.h" #include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index d9b7fb2a43..3f914fa342 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -16,266 +16,10 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (3) - - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_avx2 -# endif - -# define USE_AS_STRCAT - -/* Number of bytes in a vector register */ -# define VEC_SIZE 32 - -# ifndef SECTION -# define SECTION(p) p##.avx -# endif - - .section SECTION(.text),"ax",@progbits -ENTRY (STRCAT) - mov %rdi, %r9 -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - xor %eax, %eax - mov %edi, %ecx - and $((VEC_SIZE * 4) - 1), %ecx - vpxor %xmm6, %xmm6, %xmm6 - cmp $(VEC_SIZE * 3), %ecx - ja L(fourth_vector_boundary) - vpcmpeqb (%rdi), %ymm6, %ymm0 - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_first_vector) - mov %rdi, %rax - and $-VEC_SIZE, %rax - jmp L(align_vec_size_start) -L(fourth_vector_boundary): - mov %rdi, %rax - and $-VEC_SIZE, %rax - vpcmpeqb (%rax), %ymm6, %ymm0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - vpmovmskb %ymm0, %edx - and %r10d, %edx - jnz L(exit) - -L(align_vec_size_start): - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 - vpmovmskb %ymm3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 - add $(VEC_SIZE * 4), %rax - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 - vpmovmskb %ymm3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 - add $(VEC_SIZE * 4), %rax - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 - vpmovmskb %ymm3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 - add $(VEC_SIZE * 4), %rax - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 - vpmovmskb %ymm3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 - add $(VEC_SIZE * 5), %rax - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 - add $VEC_SIZE, %rax - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpeqb VEC_SIZE(%rax), %ymm6, 
%ymm2 - add $VEC_SIZE, %rax - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 - add $VEC_SIZE, %rax - vpmovmskb %ymm3, %edx - test %edx, %edx - jnz L(exit) - - add $VEC_SIZE, %rax - - .p2align 4 -L(align_four_vec_loop): - vmovaps (%rax), %ymm4 - vpminub VEC_SIZE(%rax), %ymm4, %ymm4 - vmovaps (VEC_SIZE * 2)(%rax), %ymm5 - vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 - add $(VEC_SIZE * 4), %rax - vpminub %ymm4, %ymm5, %ymm5 - vpcmpeqb %ymm5, %ymm6, %ymm5 - vpmovmskb %ymm5, %edx - test %edx, %edx - jz L(align_four_vec_loop) - - vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 - sub $(VEC_SIZE * 5), %rax - vpmovmskb %ymm0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 - vpmovmskb %ymm1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 - vpmovmskb %ymm2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 - vpmovmskb %ymm3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_null_on_first_vector): - bsf %rdx, %rdx - add %rdx, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_second_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $VEC_SIZE, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_third_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 2), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fourth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 3), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fifth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - - .p2align 4 -L(StartStrcpyPart): - lea (%r9, %rax), %rdi - mov %rsi, %rcx - mov %r9, %rax /* save result */ - -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(ExitZero) -# define USE_AS_STRNCPY -# endif - -# include "strcpy-avx2.S" +#ifndef STRCAT +# define STRCAT __strcat_avx2 #endif + +#define USE_AS_STRCAT +#define STRCPY STRCAT +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S new file mode 100644 index 0000000000..f50514e07c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S @@ -0,0 +1,101 @@ +/* strlen used for begining of str{n}cat using AVX2. + Copyright (C) 2011-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +/* NOTE: This file is meant to be included by strcat-avx2 or + strncat-avx2 and does not standalone. Before including %rdi + must be saved in %rax. 
*/ + + +/* Simple strlen implementation that ends at + L(strcat_strlen_done). */ + movq %rdi, %r8 + andq $(VEC_SIZE * -1), %r8 + VPCMPEQ (%r8), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + shrxl %edi, %ecx, %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v0) + + VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + leaq (VEC_SIZE)(%r8), %rdi + testl %ecx, %ecx + jnz L(bsf_and_done_v0) + + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v3) + + orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4,, 8 +L(loop_2x_vec): + VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0) + VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1) + VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2) + VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3) + VPMIN %VMM(1), %VMM(3), %VMM(3) + VPCMPEQ %VMM(3), %VZERO, %VMM(3) + vpmovmskb %VMM(3), %r8d + subq $(VEC_SIZE * -4), %rdi + testl %r8d, %r8d + jz L(loop_2x_vec) + + addq $(VEC_SIZE * -4 + 1), %rdi + + VPCMPEQ %VMM(0), %VZERO, %VMM(0) + vpmovmskb %VMM(0), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v0) + + VPCMPEQ %VMM(1), %VZERO, %VMM(1) + vpmovmskb %VMM(1), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v1) + + VPCMPEQ %VMM(2), %VZERO, %VMM(2) + vpmovmskb %VMM(2), %ecx + testl %ecx, %ecx + jnz L(bsf_and_done_v2) + + movl %r8d, %ecx +L(bsf_and_done_v3): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v2): + bsfl %ecx, %ecx + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi + jmp L(strcat_strlen_done) + + .p2align 4,, 4 +L(bsf_and_done_v1): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v0): + bsfl %ecx, %ecx + addq %rcx, %rdi +L(strcat_strlen_done): diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S index c2c581ecf7..3ae2de8ea9 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -1,12 +1,3 @@ -#ifndef STRCPY -# define STRCPY __strcpy_avx2_rtm -#endif - -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST - -#define VZEROUPPER_RETURN jmp L(return_vzeroupper) - -#define SECTION(p) p##.avx.rtm - +#define STRCPY __strcpy_avx2_rtm +#include "x86-avx-rtm-vecs.h" #include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index c725834929..32f86baa4c 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -20,984 +20,378 @@ #if ISA_SHOULD_BUILD (3) +# include <sysdep.h> -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_avx2 -# endif - -# endif - -/* Number of bytes in a vector register */ # ifndef VEC_SIZE -# define VEC_SIZE 32 -# endif - -# ifndef VZEROUPPER -# define VZEROUPPER vzeroupper -# endif - -# ifndef SECTION -# define SECTION(p) p##.avx -# endif - -/* zero register */ -#define xmmZ xmm0 -#define ymmZ ymm0 - -/* mask register */ -#define ymmM ymm1 - -# ifndef USE_AS_STRCAT - - .section SECTION(.text),"ax",@progbits -ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP - test %R8_LP, %R8_LP - jz L(ExitZero) -# endif - mov %rsi, %rcx -# ifndef USE_AS_STPCPY - mov %rdi, %rax /* save result */ -# endif - +# include "x86-avx-vecs.h" # endif - vpxor %xmmZ, %xmmZ, %xmmZ - - and $((VEC_SIZE * 4) - 1), %ecx - cmp $(VEC_SIZE * 2), %ecx - jbe 
L(SourceStringAlignmentLessTwoVecSize) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - - vpcmpeqb (%rsi), %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - shr %cl, %rdx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - mov $VEC_SIZE, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# else - mov $(VEC_SIZE + 1), %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# endif - jbe L(CopyVecSizeTailCase2OrCase3) +# ifndef STRCPY +# define STRCPY __strcpy_avx2 # endif - test %edx, %edx - jnz L(CopyVecSizeTail) - vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 - vpmovmskb %ymm2, %edx + /* Use movsb in page cross case to save code size. */ +# define USE_MOVSB_IN_PAGE_CROSS 1 -# ifdef USE_AS_STRNCPY - add $VEC_SIZE, %r10 - cmp %r10, %r8 - jbe L(CopyTwoVecSizeCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize) - - vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ - vmovdqu %ymm2, (%rdi) - -/* If source address alignment != destination address alignment */ - .p2align 4 -L(UnalignVecSizeBoth): - sub %rcx, %rdi -# ifdef USE_AS_STRNCPY - add %rcx, %r8 - sbb %rcx, %rcx - or %rcx, %r8 -# endif - mov $VEC_SIZE, %rcx - vmovdqa (%rsi, %rcx), %ymm2 - vmovdqu %ymm2, (%rdi, %rcx) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 - vpcmpeqb %ymm2, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 3), %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) +# ifdef USE_AS_WCSCPY +# define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 # else - jnz L(CopyVecSize) +# define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 # endif - vmovdqu %ymm2, (%rdi, %rcx) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 - vpcmpeqb %ymm3, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) -# else - jnz L(CopyVecSize) -# endif +# define PAGE_SIZE 4096 - vmovdqu %ymm3, (%rdi, %rcx) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 - vpcmpeqb %ymm4, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) +# ifdef USE_AS_STPCPY +# define END_REG rax # else - jnz L(CopyVecSize) +# define END_REG rdi, %rdx # endif - vmovdqu %ymm4, (%rdi, %rcx) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 - vpcmpeqb %ymm2, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) +# ifdef USE_AS_STRCAT +# define PAGE_ALIGN_REG ecx # else - jnz L(CopyVecSize) +# define PAGE_ALIGN_REG eax # endif - vmovdqu %ymm2, (%rdi, %rcx) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 - vpcmpeqb %ymm2, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 - vmovdqu %ymm2, (%rdi, %rcx) - vpcmpeqb %ymm3, %ymmZ, %ymmM - 
vpmovmskb %ymmM, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) -# else - jnz L(CopyVecSize) -# endif + .section SECTION(.text), "ax", @progbits +ENTRY(STRCPY) + vpxor %VZERO_128, %VZERO_128, %VZERO_128 - vmovdqu %ymm3, (%rdi, %rcx) - mov %rsi, %rdx - lea VEC_SIZE(%rsi, %rcx), %rsi - and $-(VEC_SIZE * 4), %rsi - sub %rsi, %rdx - sub %rdx, %rdi -# ifdef USE_AS_STRNCPY - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -# endif -L(UnalignedFourVecSizeLoop): - vmovdqa (%rsi), %ymm4 - vmovdqa VEC_SIZE(%rsi), %ymm5 - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 - vpminub %ymm5, %ymm4, %ymm2 - vpminub %ymm7, %ymm6, %ymm3 - vpminub %ymm2, %ymm3, %ymm3 - vpcmpeqb %ymmM, %ymm3, %ymm3 - vpmovmskb %ymm3, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %edx, %edx - jnz L(UnalignedFourVecSizeLeave) - -L(UnalignedFourVecSizeLoop_start): - add $(VEC_SIZE * 4), %rdi - add $(VEC_SIZE * 4), %rsi - vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) - vmovdqa (%rsi), %ymm4 - vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) - vmovdqa VEC_SIZE(%rsi), %ymm5 - vpminub %ymm5, %ymm4, %ymm2 - vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 - vmovdqu %ymm7, -VEC_SIZE(%rdi) - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 - vpminub %ymm7, %ymm6, %ymm3 - vpminub %ymm2, %ymm3, %ymm3 - vpcmpeqb %ymmM, %ymm3, %ymm3 - vpmovmskb %ymm3, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) -# endif - test %edx, %edx - jz L(UnalignedFourVecSizeLoop_start) - -L(UnalignedFourVecSizeLeave): - vpcmpeqb %ymm4, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_0) - - vpcmpeqb %ymm5, %ymmZ, %ymmM - vpmovmskb %ymmM, %ecx - test %ecx, %ecx - jnz L(CopyVecSizeUnaligned_16) - - vpcmpeqb %ymm6, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - test %edx, %edx - jnz L(CopyVecSizeUnaligned_32) - - vpcmpeqb %ymm7, %ymmZ, %ymmM - vpmovmskb %ymmM, %ecx - bsf %ecx, %edx - vmovdqu %ymm4, (%rdi) - vmovdqu %ymm5, VEC_SIZE(%rdi) - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax -# endif - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - add $(VEC_SIZE * 3), %rsi - add $(VEC_SIZE * 3), %rdi - jmp L(CopyVecSizeExit) +# ifdef USE_AS_STRCAT + movq %rdi, %rax +# include "strcat-strlen-avx2.h.S" # endif -/* If source address alignment == destination address alignment */ - -L(SourceStringAlignmentLessTwoVecSize): - vmovdqu (%rsi), %ymm3 - vmovdqu VEC_SIZE(%rsi), %ymm2 - vpcmpeqb %ymm3, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - cmp $VEC_SIZE, %r8 -# else - cmp $(VEC_SIZE + 1), %r8 -# endif - jbe L(CopyVecSizeTail1Case2OrCase3) + movl %esi, %PAGE_ALIGN_REG + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG + ja L(page_cross) +L(page_cross_continue): +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax # endif - test %edx, %edx - jnz L(CopyVecSizeTail1) - - vmovdqu %ymm3, (%rdi) - vpcmpeqb %ymm2, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - 
cmp $(VEC_SIZE * 2), %r8 -# else - cmp $((VEC_SIZE * 2) + 1), %r8 -# endif - jbe L(CopyTwoVecSize1Case2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize1) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - jmp L(UnalignVecSizeBoth) + VMOVU (%rsi), %VMM(0) + VPCMPEQ %VMM(0), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx -/*------End of main part with loops---------------------*/ + testl %ecx, %ecx + jz L(more_1x_vec) -/* Case1 */ + /* No longer need ymm registers so just vzeroupper so it doesn't + need to be duplicated at each return statement. */ + COND_VZEROUPPER -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) - .p2align 4 -L(CopyVecSize): - add %rcx, %rdi -# endif -L(CopyVecSizeTail): - add %rcx, %rsi -L(CopyVecSizeTail1): - bsf %edx, %edx -L(CopyVecSizeExit): - cmp $32, %edx - jae L(Exit32_63) - cmp $16, %edx - jae L(Exit16_31) - cmp $8, %edx - jae L(Exit8_15) - cmp $4, %edx - jae L(Exit4_7) - cmp $3, %edx - je L(Exit3) - cmp $1, %edx - ja L(Exit2) - je L(Exit1) - movb $0, (%rdi) + xorl %edx, %edx + bsfl %ecx, %edx # ifdef USE_AS_STPCPY - lea (%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $1, %r8 - lea 1(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif -L(return_vzeroupper): - ZERO_UPPER_VEC_REGISTERS_RETURN - - .p2align 4 -L(CopyTwoVecSize1): - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $VEC_SIZE, %r8 -# endif - jmp L(CopyVecSizeTail1) - - .p2align 4 -L(CopyTwoVecSize): - bsf %edx, %edx - add %rcx, %rsi - add $VEC_SIZE, %edx - sub %ecx, %edx - jmp L(CopyVecSizeExit) - - .p2align 4 -L(CopyVecSizeUnaligned_0): - bsf %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - vmovdqu %ymm4, (%rdi) - add $((VEC_SIZE * 4) - 1), %r8 - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) -# else - jmp L(CopyVecSizeExit) -# endif - - .p2align 4 -L(CopyVecSizeUnaligned_16): - bsf %ecx, %edx - vmovdqu %ymm4, (%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea VEC_SIZE(%rdi, %rdx), %rax -# endif - vmovdqu %ymm5, VEC_SIZE(%rdi) - add $((VEC_SIZE * 3) - 1), %r8 - sub %rdx, %r8 - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) + leaq (%rdi, %rdx), %rax +# endif + + /* Use mask bits in rcx to detect which copy we need. If the low + mask is zero then there must be a bit set in the upper half. + I.e if ecx != 0 and cx == 0, then match must be upper 16 + bits so we use L(copy_16_31). 
*/ + testw %cx, %cx + jz L(copy_16_31) + + testb %cl, %cl + jz L(copy_8_15) +# ifdef USE_AS_WCSCPY + vmovd %xmm0, (%rdi) + movl $0, (%END_REG) + ret # else - add $VEC_SIZE, %rsi - add $VEC_SIZE, %rdi - jmp L(CopyVecSizeExit) -# endif - - .p2align 4 -L(CopyVecSizeUnaligned_32): - bsf %edx, %edx - vmovdqu %ymm4, (%rdi) - vmovdqu %ymm5, VEC_SIZE(%rdi) -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -# ifdef USE_AS_STPCPY - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax -# endif - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) - add $((VEC_SIZE * 2) - 1), %r8 - sub %rdx, %r8 - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi - jmp L(StrncpyFillTailWithZero) + testb $0x7, %cl + jz L(copy_4_7) + + testl %edx, %edx + jz L(set_null_term) + vmovd %xmm0, %ecx + movw %cx, (%rdi) + + .p2align 4,, 2 +L(set_null_term): + movb $0, (%END_REG) + ret + + .p2align 4,, 12 +L(copy_4_7): + movl -3(%rsi, %rdx), %ecx + vmovd %xmm0, (%rdi) + movl %ecx, -3(%END_REG) + ret +# endif + + .p2align 4,, 10 +L(copy_16_31): + VMOVU -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1 + VMOVU %xmm0, (%rdi) + VMOVU %xmm1, -(16 - CHAR_SIZE)(%END_REG) + ret + + .p2align 4,, 10 +L(copy_8_15): +# ifdef USE_AS_WCSCPY + movl -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx # else - add $(VEC_SIZE * 2), %rsi - add $(VEC_SIZE * 2), %rdi - jmp L(CopyVecSizeExit) -# endif - -# ifdef USE_AS_STRNCPY -# ifndef USE_AS_STRCAT - .p2align 4 -L(CopyVecSizeUnalignedVec6): - vmovdqu %ymm6, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec5): - vmovdqu %ymm5, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec4): - vmovdqu %ymm4, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) - - .p2align 4 -L(CopyVecSizeUnalignedVec3): - vmovdqu %ymm3, (%rdi, %rcx) - jmp L(CopyVecSizeVecExit) -# endif - -/* Case2 */ - - .p2align 4 -L(CopyVecSizeCase2): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2): - add %rcx, %rsi - bsf %edx, %edx - add $VEC_SIZE, %edx - sub %ecx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTailCase2): - add %rcx, %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -L(CopyVecSizeTail1Case2): - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) - jmp L(StrncpyExit) - -/* Case2 or Case3, Case3 */ - - .p2align 4 -L(CopyVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeCase2) -L(CopyVecSizeCase3): - add $VEC_SIZE, %r8 - add %rcx, %rdi - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSizeCase2OrCase3): - test %rdx, %rdx - jnz L(CopyTwoVecSizeCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyVecSizeTailCase2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTailCase2) - add %rcx, %rsi - jmp L(StrncpyExit) - - .p2align 4 -L(CopyTwoVecSize1Case2OrCase3): - add $VEC_SIZE, %rdi - add $VEC_SIZE, %rsi - sub $VEC_SIZE, %r8 -L(CopyVecSizeTail1Case2OrCase3): - test %rdx, %rdx - jnz L(CopyVecSizeTail1Case2) - jmp L(StrncpyExit) -# endif - -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ - - .p2align 4 -L(Exit1): - movzwl (%rsi), %edx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $2, %r8 - lea 2(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(Exit2): - movzwl (%rsi), %ecx - mov %cx, (%rdi) - movb $0, 2(%rdi) -# ifdef USE_AS_STPCPY - lea 
2(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $3, %r8 - lea 3(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(Exit3): - mov (%rsi), %edx - mov %edx, (%rdi) + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx +# endif + vmovq %xmm0, (%rdi) + movq %rcx, -(8 - CHAR_SIZE)(%END_REG) + ret + + + .p2align 4,, 8 +L(more_1x_vec): +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + VMOVU %VMM(0), (%rdi) +# endif + subq %rsi, %rdi + orq $(VEC_SIZE - 1), %rsi + addq %rsi, %rdi + VMOVA 1(%rsi), %VMM(1) + + /* Try and order stores after as many loads as is reasonable to + avoid potential false dependencies. */ +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + VMOVU %VMM(0), (%rax) +# endif + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE + 1)(%rsi), %VMM(2) + VMOVU %VMM(1), 1(%rdi) + + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x2) + + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(3) + VMOVU %VMM(2), (VEC_SIZE + 1)(%rdi) + + VPCMPEQ %VMM(3), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 2 + 1)(%rdi) + VPCMPEQ %VMM(4), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %edx + testl %edx, %edx + jnz L(ret_vec_x4) + + VMOVU %VMM(4), (VEC_SIZE * 3 + 1)(%rdi) + + /* Subtract rsi from rdi before aligning. Adding back rsi will + get proper rdi (dst) for new src. */ + subq %rsi, %rdi + incq %rsi + orq $(VEC_SIZE * 4 - 1), %rsi + + /* Do first half of loop ahead of time so loop can just start by + storing. */ + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPMIN %VMM(4), %VMM(6), %VMM(6) + VPCMPEQ %VMM(6), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %edx + addq %rsi, %rdi + + testl %edx, %edx + jnz L(loop_4x_done) + + .p2align 4,, 11 +L(loop_4x_vec): + + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi) + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi) + subq $(VEC_SIZE * -4), %rsi + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi) + VMOVU %VMM(3), (VEC_SIZE * 3 + 1)(%rdi) + + + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPMIN %VMM(4), %VMM(6), %VMM(6) + VPCMPEQ %VMM(6), %VZERO, %VMM(6) + + vpmovmskb %VMM(6), %edx + subq $(VEC_SIZE * -4), %rdi + testl %edx, %edx + jz L(loop_4x_vec) + +L(loop_4x_done): + VPCMPEQ %VMM(0), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x1) + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi) + + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x2) + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi) + + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x3) + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi) +L(ret_vec_x4): + bsfl %edx, %edx + VMOVU ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) + VMOVU %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) # ifdef USE_AS_STPCPY - lea 3(%rdi), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub $4, %r8 - lea 4(%rdi), %rdi - jnz L(StrncpyFillTailWithZero) + leaq 
(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax # endif +L(return_end): VZEROUPPER_RETURN - .p2align 4 -L(Exit4_7): - mov (%rsi), %ecx - mov %ecx, (%rdi) - mov -3(%rsi, %rdx), %ecx - mov %ecx, -3(%rdi, %rdx) + .p2align 4,, 8 +L(ret_vec_x1): + bsfl %ecx, %ecx + VMOVU (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) + VMOVU %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq 1(%rcx, %rdi), %rax # endif - VZEROUPPER_RETURN - - .p2align 4 -L(Exit8_15): - mov (%rsi), %rcx - mov -7(%rsi, %rdx), %r9 - mov %rcx, (%rdi) - mov %r9, -7(%rdi, %rdx) -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) -# endif - VZEROUPPER_RETURN +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN - .p2align 4 -L(Exit16_31): - vmovdqu (%rsi), %xmm2 - vmovdqu -15(%rsi, %rdx), %xmm3 - vmovdqu %xmm2, (%rdi) - vmovdqu %xmm3, -15(%rdi, %rdx) + .p2align 4,, 8 +L(ret_vec_x2): + bsfl %ecx, %ecx + VMOVU ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) + VMOVU %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax # endif VZEROUPPER_RETURN - .p2align 4 -L(Exit32_63): - vmovdqu (%rsi), %ymm2 - vmovdqu -31(%rsi, %rdx), %ymm3 - vmovdqu %ymm2, (%rdi) - vmovdqu %ymm3, -31(%rdi, %rdx) + .p2align 4,, 8 +L(ret_vec_x3): + bsfl %ecx, %ecx + VMOVU ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1) + VMOVU %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx) # ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - sub %rdx, %r8 - sub $1, %r8 - lea 1(%rdi, %rdx), %rdi - jnz L(StrncpyFillTailWithZero) + leaq (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax # endif VZEROUPPER_RETURN -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyExit1): - movzbl (%rsi), %edx - mov %dl, (%rdi) + .p2align 4,, 4 +L(page_cross): + movq %rsi, %rcx + andq $(VEC_SIZE * -1), %rcx + + VPCMPEQ (%rcx), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + shrxl %esi, %ecx, %ecx +# if USE_MOVSB_IN_PAGE_CROSS + /* Optimizing more aggressively for space as this is very cold + code. This saves 2x cache lines. */ + + /* This adds once to the later result which will get correct + copy bounds. NB: this can never zero-out a non-zero RCX as + to be in the page cross case rsi cannot be aligned and we + already right-shift rcx by the misalignment. 
*/ + shll $CHAR_SIZE, %ecx + jz L(page_cross_continue) + bsfl %ecx, %ecx +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT + movq %rdi, %rax +# endif + rep movsb # ifdef USE_AS_STPCPY - lea 1(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 1(%rdi) + leaq -CHAR_SIZE(%rdi), %rax # endif - VZEROUPPER_RETURN - .p2align 4 -L(StrncpyExit2): - movzwl (%rsi), %edx - mov %dx, (%rdi) -# ifdef USE_AS_STPCPY - lea 2(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 2(%rdi) -# endif VZEROUPPER_RETURN - .p2align 4 -L(StrncpyExit3_4): - movzwl (%rsi), %ecx - movzwl -2(%rsi, %r8), %edx - mov %cx, (%rdi) - mov %dx, -2(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(StrncpyExit5_8): - mov (%rsi), %ecx - mov -4(%rsi, %r8), %edx - mov %ecx, (%rdi) - mov %edx, -4(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(StrncpyExit9_16): - mov (%rsi), %rcx - mov -8(%rsi, %r8), %rdx - mov %rcx, (%rdi) - mov %rdx, -8(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(StrncpyExit17_32): - vmovdqu (%rsi), %xmm2 - vmovdqu -16(%rsi, %r8), %xmm3 - vmovdqu %xmm2, (%rdi) - vmovdqu %xmm3, -16(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(StrncpyExit33_64): - /* 0/32, 31/16 */ - vmovdqu (%rsi), %ymm2 - vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 - vmovdqu %ymm2, (%rdi) - vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) -# ifdef USE_AS_STPCPY - lea (%rdi, %r8), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi, %r8) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(StrncpyExit65): - /* 0/32, 32/32, 64/1 */ - vmovdqu (%rsi), %ymm2 - vmovdqu 32(%rsi), %ymm3 - mov 64(%rsi), %cl - vmovdqu %ymm2, (%rdi) - vmovdqu %ymm3, 32(%rdi) - mov %cl, 64(%rdi) -# ifdef USE_AS_STPCPY - lea 65(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, 65(%rdi) -# endif - VZEROUPPER_RETURN +# else + testl %ecx, %ecx + jz L(page_cross_continue) + /* Traditional copy case, essentially same as used in non-page- + cross case but since we can't reuse VMM(0) we need twice as + many loads from rsi. 
*/ # ifndef USE_AS_STRCAT - - .p2align 4 -L(Fill1): - mov %dl, (%rdi) - VZEROUPPER_RETURN - - .p2align 4 -L(Fill2): - mov %dx, (%rdi) - VZEROUPPER_RETURN - - .p2align 4 -L(Fill3_4): - mov %dx, (%rdi) - mov %dx, -2(%rdi, %r8) - VZEROUPPER_RETURN - - .p2align 4 -L(Fill5_8): - mov %edx, (%rdi) - mov %edx, -4(%rdi, %r8) - VZEROUPPER_RETURN - - .p2align 4 -L(Fill9_16): - mov %rdx, (%rdi) - mov %rdx, -8(%rdi, %r8) - VZEROUPPER_RETURN - - .p2align 4 -L(Fill17_32): - vmovdqu %xmmZ, (%rdi) - vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER_RETURN - - .p2align 4 -L(CopyVecSizeUnalignedVec2): - vmovdqu %ymm2, (%rdi, %rcx) - - .p2align 4 -L(CopyVecSizeVecExit): - bsf %edx, %edx - add $(VEC_SIZE - 1), %r8 - add %rcx, %rdi -# ifdef USE_AS_STPCPY - lea (%rdi, %rdx), %rax -# endif - sub %rdx, %r8 - lea 1(%rdi, %rdx), %rdi - - .p2align 4 -L(StrncpyFillTailWithZero): - xor %edx, %edx - sub $VEC_SIZE, %r8 - jbe L(StrncpyFillExit) - - vmovdqu %ymmZ, (%rdi) - add $VEC_SIZE, %rdi - - mov %rdi, %rsi - and $(VEC_SIZE - 1), %esi - sub %rsi, %rdi - add %rsi, %r8 - sub $(VEC_SIZE * 4), %r8 - jb L(StrncpyFillLessFourVecSize) - -L(StrncpyFillLoopVmovdqa): - vmovdqa %ymmZ, (%rdi) - vmovdqa %ymmZ, VEC_SIZE(%rdi) - vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) - vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) - add $(VEC_SIZE * 4), %rdi - sub $(VEC_SIZE * 4), %r8 - jae L(StrncpyFillLoopVmovdqa) - -L(StrncpyFillLessFourVecSize): - add $(VEC_SIZE * 2), %r8 - jl L(StrncpyFillLessTwoVecSize) - vmovdqa %ymmZ, (%rdi) - vmovdqa %ymmZ, VEC_SIZE(%rdi) - add $(VEC_SIZE * 2), %rdi - sub $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - vmovdqa %ymmZ, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillLessTwoVecSize): - add $VEC_SIZE, %r8 - jl L(StrncpyFillExit) - vmovdqa %ymmZ, (%rdi) - add $VEC_SIZE, %rdi - jmp L(Fill) - - .p2align 4 -L(StrncpyFillExit): - add $VEC_SIZE, %r8 -L(Fill): - cmp $17, %r8d - jae L(Fill17_32) - cmp $9, %r8d - jae L(Fill9_16) - cmp $5, %r8d - jae L(Fill5_8) - cmp $3, %r8d - jae L(Fill3_4) - cmp $1, %r8d - ja L(Fill2) - je L(Fill1) - VZEROUPPER_RETURN - -/* end of ifndef USE_AS_STRCAT */ + xorl %edx, %edx # endif - - .p2align 4 -L(UnalignedLeaveCase2OrCase3): - test %rdx, %rdx - jnz L(UnalignedFourVecSizeLeaveCase2) -L(UnalignedFourVecSizeLeaveCase3): - lea (VEC_SIZE * 4)(%r8), %rcx - and $-VEC_SIZE, %rcx - add $(VEC_SIZE * 3), %r8 - jl L(CopyVecSizeCase3) - vmovdqu %ymm4, (%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - vmovdqu %ymm5, VEC_SIZE(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) - sub $VEC_SIZE, %r8 - jb L(CopyVecSizeCase3) - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) + bsfl %ecx, %edx # ifdef USE_AS_STPCPY - lea (VEC_SIZE * 4)(%rdi), %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (VEC_SIZE * 4)(%rdi) + leaq (%rdi, %rdx), %rax +# elif !defined USE_AS_STRCAT + movq %rdi, %rax # endif - VZEROUPPER_RETURN - .p2align 4 -L(UnalignedFourVecSizeLeaveCase2): - xor %ecx, %ecx - vpcmpeqb %ymm4, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - add $(VEC_SIZE * 3), %r8 - jle L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif - vpcmpeqb %ymm5, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - vmovdqu %ymm4, (%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec5) -# else - jnz L(CopyVecSize) -# endif + /* vzeroupper early to avoid duplicating at each return. 
*/ + COND_VZEROUPPER - vpcmpeqb %ymm6, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - vmovdqu %ymm5, VEC_SIZE(%rdi) - add $VEC_SIZE, %rcx - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) - test %edx, %edx -# ifndef USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec6) -# else - jnz L(CopyVecSize) -# endif + testw %cx, %cx + jz L(page_cross_copy_16_31) - vpcmpeqb %ymm7, %ymmZ, %ymmM - vpmovmskb %ymmM, %edx - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) - lea VEC_SIZE(%rdi, %rcx), %rdi - lea VEC_SIZE(%rsi, %rcx), %rsi - bsf %edx, %edx - cmp %r8d, %edx - jb L(CopyVecSizeExit) -L(StrncpyExit): - cmp $65, %r8d - je L(StrncpyExit65) - cmp $33, %r8d - jae L(StrncpyExit33_64) - cmp $17, %r8d - jae L(StrncpyExit17_32) - cmp $9, %r8d - jae L(StrncpyExit9_16) - cmp $5, %r8d - jae L(StrncpyExit5_8) - cmp $3, %r8d - jae L(StrncpyExit3_4) - cmp $1, %r8d - ja L(StrncpyExit2) - je L(StrncpyExit1) -# ifdef USE_AS_STPCPY - mov %rdi, %rax -# endif -# ifdef USE_AS_STRCAT - movb $0, (%rdi) -# endif - VZEROUPPER_RETURN - - .p2align 4 -L(ExitZero): -# ifndef USE_AS_STRCAT - mov %rdi, %rax -# endif - VZEROUPPER_RETURN + testb %cl, %cl + jz L(page_cross_copy_8_15) -# endif + testl $0x7, %cl + jz L(page_cross_copy_4_7) -# ifndef USE_AS_STRCAT -END (STRCPY) -# else -END (STRCAT) -# endif + testl %edx, %edx + jz L(page_cross_set_null_term) + movzwl (%rsi), %ecx + movw %cx, (%rdi) +L(page_cross_set_null_term): + movb $0, (%END_REG) + ret + + .p2align 4,, 4 +L(page_cross_copy_4_7): + movl (%rsi), %ecx + movl -3(%rsi, %rdx), %esi + movl %ecx, (%rdi) + movl %esi, -3(%END_REG) + ret + + .p2align 4,, 4 +L(page_cross_copy_8_15): + movq (%rsi), %rcx + movq -7(%rsi, %rdx), %rsi + movq %rcx, (%rdi) + movq %rsi, -7(%END_REG) + ret + + + .p2align 4,, 3 +L(page_cross_copy_16_31): + VMOVU (%rsi), %xmm0 + VMOVU -15(%rsi, %rdx), %xmm1 + VMOVU %xmm0, (%rdi) + VMOVU %xmm1, -15(%END_REG) + ret +# endif + +END(STRCPY) #endif diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S index 0dcea18dbb..7272deef2c 100644 --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -1,3 +1,3 @@ -#define USE_AS_STRNCAT -#define STRCAT __strncat_avx2_rtm -#include "strcat-avx2-rtm.S" +#define STRNCAT __strncat_avx2_rtm +#include "x86-avx-rtm-vecs.h" +#include "strncat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S index 52ecbca943..ffa58bd0de 100644 --- a/sysdeps/x86_64/multiarch/strncat-avx2.S +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S @@ -1,7 +1,419 @@ -#ifndef STRNCAT -# define STRNCAT __strncat_avx2 -#endif +/* strncat with AVX2 + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
   */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCAT
+#  define STRNCAT	__strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+#  define MOVCHAR	movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define MOVCHAR	movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE	4096
+
+# define VZERO	VMM(7)
+# define VZERO_128	VMM_128(7)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+	/* Filter zero length strings and very long strings.  Zero
+	   length strings just return; very long strings are handled by
+	   using the non-length variant {wcs|str}cat.  */
+	movq	%rdi, %rax
+# ifdef USE_AS_WCSCPY
+	leaq	-1(%rdx), %rcx
+	shr	$56, %rcx
+	jnz	L(zero_len)
+	salq	$2, %rdx
+# else
+	test	%rdx, %rdx
+	jl	L(zero_len)
+# endif
+	vpxor	%VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.h.S"
+
+	movl	%esi, %ecx
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+L(page_cross_continue):
+	VMOVU	(%rsi), %VMM(0)
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	tzcnt	%ecx, %r8d
+	cmpq	%r8, %rdx
+	jbe	L(less_1x_vec)
+
+	testl	%ecx, %ecx
+	jz	L(more_1x_vec)
+
+	/* Hoist this to save code size.  */
+
+	movl	%r8d, %edx
+
+L(less_1x_vec):
+	COND_VZEROUPPER
+
+	cmpl	$16, %edx
+	jae	L(copy_16_31)
+	cmpl	$8, %edx
+	jae	L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+	vmovd	%VMM_128(0), (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+# else
+	cmpl	$4, %edx
+	jae	L(copy_4_7)
+
+	movzbl	(%rsi), %ecx
+	cmpl	$1, %edx
+	jbe	L(set_null_term)
+
+	/* NB: make this `vmovw` if support for AVX512-FP16 is added.
+	 */
+	movzwl	1(%rsi), %esi
+	movw	%si, 1(%rdi)
+
+	.p2align 4,, 1
+L(set_null_term):
+	movb	%cl, (%rdi)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 11
+L(copy_4_7):
+	movl	-(4)(%rsi, %rdx), %ecx
+	vmovd	%xmm0, (%rdi)
+	movl	%ecx, -(4)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(copy_16_31):
+	VMOVU	-(16)(%rsi, %rdx), %xmm1
+	VMOVU	%xmm0, (%rdi)
+	VMOVU	%xmm1, -(16)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 10
+L(copy_8_15):
+	movq	-(8)(%rsi, %rdx), %rcx
+	vmovq	%xmm0, (%rdi)
+	movq	%rcx, -(8)(%rdi, %rdx)
+	MOVCHAR	$0, (%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+	.p2align 6,, 14
+L(more_1x_vec):
+	VMOVU	%VMM(0), (%rdi)
+
+	/* Align rsi (src) and adjust rdx/rdi (length/dst).
*/ + addq %rsi, %rdx + subq %rsi, %rdi + orq $(VEC_SIZE - 1), %rsi + incq %rsi + addq %rsi, %rdi +L(loop_last_4x_vec): + subq %rsi, %rdx + VMOVA 0(%rsi), %VMM(1) + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +L(last_2x_vec): + tzcnt %ecx, %ecx + cmpl %ecx, %edx + jbe L(ret_vec_x1_len) + + cmpl $VEC_SIZE, %ecx + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2) + VMOVU %VMM(1), (%rdi) + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + addl $-VEC_SIZE, %edx + bzhil %edx, %ecx, %r8d + jz L(ret_vec_x2_len) +L(ret_vec_x2): + bsfl %ecx, %edx +L(ret_vec_x2_len): + VMOVU (%rsi, %rdx), %VMM(0) + MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx) + VMOVU %VMM(0), (%rdi, %rdx) +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + + .p2align 4,, 12 +L(ret_vec_x1_len): + movl %edx, %ecx +L(ret_vec_x1): + VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1) + MOVCHAR $0, (%rdi, %rcx) + VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx) + VZEROUPPER_RETURN + + .p2align 4,, 8 +L(last_4x_vec): + subq $-(VEC_SIZE * 4), %rsi + VMOVA 0(%rsi), %VMM(1) + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + subq $-(VEC_SIZE * 4), %rdi + addl $-(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + .p2align 4,, 8 +L(more_2x_vec): + /* L(ret_vec_x1) expects ecx to have position of first match so + test with bsf. */ + bsfl %ecx, %ecx + jnz L(ret_vec_x1) + + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2) + VMOVU %VMM(1), (%rdi) + + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x2) + -#define USE_AS_STRNCAT -#define STRCAT STRNCAT -#include "strcat-avx2.S" + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3) + VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi) + + VPCMPEQ %VMM(3), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + + /* Check if length is greater than 4x VEC. */ + cmpq $(VEC_SIZE * 4), %rdx + ja L(more_4x_vec) + + addl $(VEC_SIZE * -2), %edx + + tzcnt %ecx, %ecx + cmpl %ecx, %edx + jbe L(ret_vec_x3_len) + + cmpl $VEC_SIZE, %ecx + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi) + VPCMPEQ %VMM(4), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + addl $-VEC_SIZE, %edx + bzhil %edx, %ecx, %r8d + jz L(ret_vec_x4_len) +L(ret_vec_x4): + bsfl %ecx, %edx +L(ret_vec_x4_len): + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0) + MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx) + VZEROUPPER_RETURN + + .p2align 4,, 4 +L(ret_vec_x3_len): + movl %edx, %ecx +L(ret_vec_x3): + VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0) + MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx) + VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx) + VZEROUPPER_RETURN + + + .p2align 4,, 8 +L(more_4x_vec): + bsfl %ecx, %ecx + jnz L(ret_vec_x3) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) + VPCMPEQ %VMM(4), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(ret_vec_x4) + + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi) + + + /* Recheck length before aligning. */ + cmpq $(VEC_SIZE * 8), %rdx + jbe L(last_4x_vec) + + /* Align rsi (src) and just rdx/rdi (length/dst). */ + addq %rsi, %rdx + subq %rsi, %rdi + subq $-(VEC_SIZE * 4), %rsi + andq $(VEC_SIZE * -4), %rsi + + /* Do first half of loop ahead of time so loop can just start by + storing. 
   */
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %r8d
+	addq	%rsi, %rdi
+	testl	%r8d, %r8d
+	jnz	L(loop_4x_done)
+
+	/* Use r9 for end of region before handling last 4x VEC
+	   specially.  */
+	leaq	-(VEC_SIZE * 4)(%rdx), %r9
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+	subq	$(VEC_SIZE * -4), %rsi
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpq	%rsi, %r9
+	jbe	L(loop_last_4x_vec)
+
+	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+	VPMIN	%VMM(0), %VMM(1), %VMM(4)
+	VPMIN	%VMM(2), %VMM(3), %VMM(6)
+	VPMIN	%VMM(4), %VMM(6), %VMM(6)
+	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %r8d
+
+	testl	%r8d, %r8d
+	jz	L(loop_4x_vec)
+
+L(loop_4x_done):
+	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	/* L(ret_vec_x1) expects ecx to have position of first match so
+	   test with bsf.  */
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x1)
+	VMOVU	%VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+
+	testl	%ecx, %ecx
+	jnz	L(ret_vec_x2)
+	VMOVU	%VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
+	vpmovmskb %VMM(6), %ecx
+	bsfl	%ecx, %ecx
+	jnz	L(ret_vec_x3)
+
+	VMOVU	%VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+	bsfl	%r8d, %r8d
+	VMOVU	(VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+	VZEROUPPER_RETURN
+
+
+
+	.p2align 4,, 4
+L(page_cross):
+	movq	%rsi, %r8
+	andq	$(VEC_SIZE * -1), %r8
+
+	VPCMPEQ	(%r8), %VZERO, %VMM(6)
+
+	vpmovmskb %VMM(6), %ecx
+	shrxl	%esi, %ecx, %ecx
+
+	subl	%esi, %r8d
+	andl	$(VEC_SIZE - 1), %r8d
+	cmpq	%r8, %rdx
+	jb	L(page_cross_small)
+
+	/* Optimizing more aggressively for space as this is very cold
+	   code. This saves 2x cache lines.  */
+
+	/* This adds once to the later result which will get correct
+	   copy bounds. NB: this can never zero-out a non-zero RCX as
+	   to be in the page cross case rsi cannot be aligned and we
+	   already right-shift rcx by the misalignment.
*/ + shll $CHAR_SIZE, %ecx + jz L(page_cross_continue) + bsfl %ecx, %ecx + rep movsb + VZEROUPPER_RETURN + +L(page_cross_small): + tzcntl %ecx, %ecx + jz L(page_cross_setz) + cmpl %edx, %ecx + cmova %edx, %ecx + rep movsb +L(page_cross_setz): + MOVCHAR $0, (%rdi) + VZEROUPPER_RETURN +L(zero_len): +# ifdef USE_AS_WCSCPY + test %rdx, %rdx +# endif + jnz OVERFLOW_STRCAT + ret + + +END(STRNCAT) +#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S index 79e7083299..d42ad88b3d 100644 --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -1,3 +1,3 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_avx2_rtm -#include "strcpy-avx2-rtm.S" +#define STRNCPY __strncpy_avx2_rtm +#include "x86-avx-rtm-vecs.h" +#include "strncpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S index ce634e94fa..e9afd8fbed 100644 --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S @@ -1,7 +1,735 @@ -#ifndef STRNCPY -# define STRNCPY __strncpy_avx2 -#endif +/* strncpy with AVX2 + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (3) + +# include <sysdep.h> + + +# ifndef VEC_SIZE +# include "x86-avx-vecs.h" +# endif + +# ifndef STRNCPY +# define STRNCPY __strncpy_avx2 +# endif + + +# ifdef USE_AS_WCSCPY +# define VPCMPEQ vpcmpeqd +# define VPMIN vpminud +# define CHAR_SIZE 4 +# else +# define VPCMPEQ vpcmpeqb +# define VPMIN vpminub +# define CHAR_SIZE 1 +# endif + +# include "strncpy-or-cat-overflow-def.h" + +# define PAGE_SIZE 4096 + +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) + + + .section SECTION(.text), "ax", @progbits +ENTRY(STRNCPY) + /* Filter zero length strings and very long strings. Zero + length strings just return, very long strings are handled by + just running rep stos{b|l} to zero set (which will almost + certainly segfault), if that succeeds then just calling + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */ +# ifdef USE_AS_WCSCPY + decq %rdx + movq %rdx, %rax + /* 56 is end of max supported address space. */ + shr $56, %rax + jnz L(zero_len) + salq $2, %rdx +# else + decq %rdx + /* `dec` can macrofuse with `jl`. If the flag needs to become + `jb` replace `dec` with `sub`. */ + jl L(zero_len) +# endif + + vpxor %VZERO_128, %VZERO_128, %VZERO_128 + movl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + +L(page_cross_continue): + VMOVU (%rsi), %VMM(0) + VPCMPEQ %VMM(0), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + + /* If no STPCPY just save end ahead of time. 
*/ +# ifndef USE_AS_STPCPY + movq %rdi, %rax +# elif defined USE_AS_WCSCPY + /* Clear dependency as nearly all return code for wcpncpy uses + `setc %al`. */ + xorl %eax, %eax +# endif + + cmpq $(VEC_SIZE - CHAR_SIZE), %rdx + /* `jb` because length rdx is now length - CHAR_SIZE. */ + jbe L(less_1x_vec) + + /* This may overset but thats fine because we still need to zero + fill. */ + VMOVU %VMM(0), (%rdi) + + testl %ecx, %ecx + jnz L(zfill) + + /* Align. */ + addq %rsi, %rdx + subq %rsi, %rdi + orq $(VEC_SIZE - 1), %rsi + incq %rsi +L(last_4x_vec): + addq %rsi, %rdi +L(loop_last_4x_vec): + subq %rsi, %rdx + + + VMOVA 0(%rsi), %VMM(1) + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + + cmpq $(VEC_SIZE * 2), %rdx + jae L(more_2x_vec) + + cmpl $(VEC_SIZE), %edx + jb L(ret_vec_x1_len) + + testl %ecx, %ecx + jnz L(ret_vec_x1) + + VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6) + VMOVU %VMM(1), (%rdi) + vpmovmskb %VMM(6), %ecx + shlq $VEC_SIZE, %rcx +L(ret_vec_x1_len): + tzcntq %rcx, %rcx + cmpl %ecx, %edx + jbe L(ret_vec_x1_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) +L(ret_vec_x1_len_no_zfill_mov): + movl %ecx, %edx +# ifdef USE_AS_STPCPY + /* clear flags. */ + xorl %ecx, %ecx +# endif +L(ret_vec_x1_len_no_zfill): + VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) + VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + setc %al + addq %rdx, %rdi + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# endif +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4,, 6 +L(ret_vec_x1): + bsfl %ecx, %ecx + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) + subl %ecx, %edx + /* Check if we need to reload/store. */ + cmpl $VEC_SIZE, %edx + jb L(ret_vec_x1_len_no_zfill_mov) + /* Otherwise safe to just store directly. */ + VMOVU %VMM(1), (%rdi) + VMOVU %VZERO, (%rdi, %rcx) +# ifdef USE_AS_STPCPY + leaq (%rdi, %rcx), %rax +# endif + VZEROUPPER_RETURN + + .p2align 4,, 12 +L(more_2x_vec): + VMOVU %VMM(1), (%rdi) + testl %ecx, %ecx + /* Must fill at least 2x VEC. */ + jnz L(zfill_vec1) + + VMOVA VEC_SIZE(%rsi), %VMM(2) + VMOVU %VMM(2), VEC_SIZE(%rdi) + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + /* Must fill at least 1x VEC. */ + jnz L(zfill_vec2) + + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3) + VPCMPEQ %VMM(3), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + + /* Check if len is more 4x VEC. -CHAR_SIZE because rdx is len - + CHAR_SIZE. */ + cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx + ja L(more_4x_vec) + + subl $(VEC_SIZE * 3), %edx + jb L(ret_vec_x3_len) + + testl %ecx, %ecx + jnz L(ret_vec_x3) + + VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6) + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) + vpmovmskb %VMM(6), %ecx + tzcntl %ecx, %ecx + cmpl %ecx, %edx + jbe L(ret_vec_x4_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. 
*/ + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) + movl %ecx, %edx +L(ret_vec_x4_len_no_zfill): + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) + VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + setc %al + addq %rdx, %rdi + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +# else + leal (VEC_SIZE * 3 + 0)(%edx), %eax + adcq %rdi, %rax +# endif +# endif + VZEROUPPER_RETURN + + +L(ret_vec_x3_len): + addl $(VEC_SIZE * 1), %edx + tzcntl %ecx, %ecx + cmpl %ecx, %edx + jbe L(ret_vec_x3_len_no_zfill) + /* Fall through (expectation) is copy len < buffer len. */ + VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) +L(ret_vec_x3_len_no_zfill_mov): + movl %ecx, %edx +# ifdef USE_AS_STPCPY + /* clear flags. */ + xorl %ecx, %ecx +# endif + .p2align 4,, 4 +L(ret_vec_x3_len_no_zfill): + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1) + VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx) +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + setc %al + addq %rdx, %rdi + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +# else + leal (VEC_SIZE * 2 + 0)(%rdx), %eax + adcq %rdi, %rax +# endif +# endif + VZEROUPPER_RETURN + + + .p2align 4,, 8 +L(ret_vec_x3): + bsfl %ecx, %ecx + VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx) + subl %ecx, %edx + jl L(ret_vec_x3_len_no_zfill_mov) + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx) +# ifdef USE_AS_STPCPY + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax +# endif + VZEROUPPER_RETURN + + .p2align 4,, 8 +L(more_4x_vec): + + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi) + testl %ecx, %ecx + jnz L(zfill_vec3) + + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4) + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi) + VPCMPEQ %VMM(4), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(zfill_vec4) + + movq %rdx, %rcx + addq %rsi, %rdx + subq %rsi, %rdi + subq $-(VEC_SIZE * 4), %rsi + /* Recheck length before aligning. */ + cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx + jbe L(last_4x_vec) + + andq $(VEC_SIZE * -4), %rsi + + /* Do first half of loop ahead of time so loop can just start by + storing. */ + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPMIN %VMM(4), %VMM(6), %VMM(6) + VPCMPEQ %VMM(6), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %r8d + addq %rsi, %rdi + testl %r8d, %r8d + jnz L(loop_4x_done) + + /* Use r9 as end register. 
*/ + leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9 -#define USE_AS_STRNCPY -#define STRCPY STRNCPY -#include "strcpy-avx2.S" + .p2align 4,, 11 +L(loop_4x_vec): + + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) + subq $(VEC_SIZE * -4), %rsi + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) + + subq $(VEC_SIZE * -4), %rdi + cmpq %rsi, %r9 + jbe L(loop_last_4x_vec) + + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0) + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3) + + VPMIN %VMM(0), %VMM(1), %VMM(4) + VPMIN %VMM(2), %VMM(3), %VMM(6) + VPMIN %VMM(4), %VMM(6), %VMM(6) + VPCMPEQ %VMM(6), %VZERO, %VMM(6) + + vpmovmskb %VMM(6), %r8d + + testl %r8d, %r8d + jz L(loop_4x_vec) + +L(loop_4x_done): + subq %rsi, %rdx + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi) + VPCMPEQ %VMM(0), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(zfill_vec1) + + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi) + VPCMPEQ %VMM(1), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(zfill_vec2) + + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi) + VPCMPEQ %VMM(2), %VZERO, %VMM(6) + vpmovmskb %VMM(6), %ecx + testl %ecx, %ecx + jnz L(zfill_vec3) + + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi) + movl %r8d, %ecx + + // Zfill more.... + + .p2align 4,, 4 +L(zfill_vec4): + addq $(VEC_SIZE * 2), %rdi + subq $(VEC_SIZE * 2), %rdx +L(zfill_vec2): + shlq $VEC_SIZE, %rcx +L(zfill): + bsfq %rcx, %rcx + subq %rcx, %rdx + addq %rcx, %rdi +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_from_page_cross): + cmpq $VEC_SIZE, %rdx + jb L(zfill_less_vec_vzeroupper) + +L(zfill_more_1x_vec): + VMOVU %VZERO, CHAR_SIZE(%rdi) + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) + cmpq $(VEC_SIZE * 2), %rdx + jae L(zfill_more_2x_vec) +L(zfill_done0): + VZEROUPPER_RETURN + + .p2align 4,, 8 +L(zfill_vec3): + addq $(VEC_SIZE * 2), %rdi + subq $(VEC_SIZE * 2), %rdx + .p2align 4,, 2 +L(zfill_vec1): + bsfl %ecx, %ecx + addq %rcx, %rdi + subq %rcx, %rdx +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + /* zfill from vec1/vec3 must have to set at least 2x VECS. 
*/ + + VMOVU %VZERO, CHAR_SIZE(%rdi) + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx) + cmpq $(VEC_SIZE * 2), %rdx + jb L(zfill_done0) +L(zfill_more_2x_vec): + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi) + subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx + jbe L(zfill_done) + + addq %rdi, %rdx + VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi) + VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi) + + + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx) + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx) + + subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi + cmpq %rdi, %rdx + jbe L(zfill_done) + + andq $-(VEC_SIZE), %rdi + .p2align 4,, 12 +L(zfill_loop_4x_vec): + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi) + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpq %rdi, %rdx + ja L(zfill_loop_4x_vec) +L(zfill_done): + VZEROUPPER_RETURN + + + .p2align 4,, 8 +L(copy_1x): + VMOVU %VMM(0), (%rdi) + testl %ecx, %ecx + jz L(ret_32_32) +L(zfill_less_vec): + bsfl %ecx, %ecx +L(zfill_less_vec_no_bsf): + subq %rcx, %rdx + addq %rcx, %rdi +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_less_vec_vzeroupper): + COND_VZEROUPPER + /* We are taking advantage of the fact that to be here we must + be writing null-term as (%rdi, %rcx) we have a byte of lee- + way for overwriting. */ + cmpl $16, %edx + jb L(zfill_less_16) + VMOVU %VZERO_128, (%rdi) + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx) + ret +# ifdef USE_AS_STPCPY +L(ret_32_32): + leaq CHAR_SIZE(%rdi, %rdx), %rax + VZEROUPPER_RETURN +# endif + + .p2align 4,, 4 +L(copy_16_31): + /* Overfill to avoid branches. */ + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx) + cmpl %ecx, %edx + ja L(zfill_less_vec_no_bsf) +# ifndef USE_AS_STPCPY +L(ret_32_32): +# else +# ifdef USE_AS_WCSCPY + setc %al + addq %rdx, %rdi + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# endif + VZEROUPPER_RETURN + + .p2align 4,, 4 +L(copy_8_15): + /* Overfill to avoid branches. */ + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi + vmovq %xmm0, (%rdi) + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx) + cmpl %ecx, %edx + jbe L(ret_8_15) + subq %rcx, %rdx + addq %rcx, %rdi +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + .p2align 4,, 8 +L(zfill_less_16): + xorl %ecx, %ecx + cmpl $8, %edx + jb L(zfill_less_8) + movq %rcx, (%rdi) + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx) +# ifndef USE_AS_STPCPY +L(ret_8_15): +# endif + ret + + + .p2align 4,, 8 +L(less_1x_vec): + /* Reuse flag from `cmp $VEC_SIZE, %rdx`. The idea is many + buffer sizes are aligned conventially. */ + je L(copy_1x) + + tzcntl %ecx, %ecx + cmpl $16, %edx + jae L(copy_16_31) + + COND_VZEROUPPER + cmpl $8, %edx + jae L(copy_8_15) +# ifdef USE_AS_WCSCPY + testl %ecx, %ecx + jz L(zfill_less_8_set_ret) + + movl (%rsi, %rdx), %esi + vmovd %xmm0, (%rdi) + movl %esi, (%rdi, %rdx) + +# ifdef USE_AS_STPCPY + cmpl %ecx, %edx +L(ret_8_15): + setc %al + addq %rdx, %rdi + leaq (%rdi, %rax, CHAR_SIZE), %rax +# endif + ret +L(zfill_less_8_set_ret): + xorl %ecx, %ecx +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif +L(zfill_less_8): + movl %ecx, (%rdi) + movl %ecx, (%rdi, %rdx) + ret + +# else + cmpl $3, %edx + jb L(copy_0_3) + /* Overfill to avoid branches. 
*/ + movl -3(%rsi, %rdx), %esi + vmovd %xmm0, (%rdi) + movl %esi, -3(%rdi, %rdx) + cmpl %ecx, %edx + jbe L(ret_4_7) + subq %rcx, %rdx + addq %rcx, %rdi +# ifdef USE_AS_STPCPY + movq %rdi, %rax +# endif + xorl %ecx, %ecx + .p2align 4,, 8 +L(zfill_less_8): + cmpl $3, %edx + jb L(zfill_less_3) + movl %ecx, (%rdi) + movl %ecx, -3(%rdi, %rdx) +# ifdef USE_AS_STPCPY + ret +# endif + +L(ret_4_7): +# ifdef USE_AS_STPCPY +L(ret_8_15): + movl %edx, %eax + adcq %rdi, %rax +# endif + ret + + .p2align 4,, 4 +L(zfill_less_3): + testl %edx, %edx + jz L(zfill_1) + movw %cx, (%rdi) +L(zfill_1): + movb %cl, (%rdi, %rdx) + ret + + .p2align 4,, 8 +L(copy_0_3): + vmovd %xmm0, %r8d + testl %edx, %edx + jz L(copy_1) + movw %r8w, (%rdi) + cmpl %ecx, %edx + ja L(zfill_from_1) + movzbl (%rsi, %rdx), %r8d +# ifdef USE_AS_STPCPY + movl %edx, %eax + adcq %rdi, %rax + movb %r8b, (%rdi, %rdx) + ret +# endif + +L(copy_1): +# ifdef USE_AS_STPCPY + movl %edx, %eax + cmpl %ecx, %edx + adcq %rdi, %rax +# endif +# ifdef USE_AS_WCSCPY + vmovd %xmm0, (%rdi) +# else + movb %r8b, (%rdi, %rdx) +# endif + ret +# endif + + .p2align 4,, 2 +L(zero_len): + movq %rdi, %rax + ret +# ifndef USE_AS_WCSCPY + .p2align 4,, 8 +L(zfill_from_1): +# ifdef USE_AS_STPCPY + leaq (%rdi, %rcx), %rax +# endif + movw $0, -1(%rdi, %rdx) + ret +# endif + + .p2align 4,, 4 + .p2align 6,, 8 +L(page_cross): + movq %rsi, %rax + andq $(VEC_SIZE * -1), %rax + + VPCMPEQ (%rax), %VZERO, %VMM(6) + + vpmovmskb %VMM(6), %ecx + shrxl %esi, %ecx, %ecx + + subl %esi, %eax + andl $(VEC_SIZE - 1), %eax + cmpq %rax, %rdx + jb L(page_cross_small) + /* Optimizing more aggressively for space as this is very cold + code. This saves 2x cache lines. */ + + /* If rcx is non-zero then continue. */ + shl $CHAR_SIZE, %ecx + jz L(page_cross_continue) + bsf %ecx, %ecx + + subq %rcx, %rdx +# ifdef USE_AS_STPCPY + leaq -CHAR_SIZE(%rdi, %rcx), %rax +# else + movq %rdi, %rax +# endif + + rep movsb +# ifdef USE_AS_WCSCPY + movl $0, (%rdi) +# else + movb $0, (%rdi) +# endif + jmp L(zfill_from_page_cross) + +L(page_cross_small): + tzcntl %ecx, %ecx + xorl %eax, %eax + cmpl %ecx, %edx + jbe L(page_cross_copy_only) + + /* Do a zfill of the tail before copying. */ + movq %rdi, %r9 + movl %ecx, %r8d + + subl %ecx, %edx + leaq CHAR_SIZE(%rdi, %rcx), %rdi + movl %edx, %ecx + rep stosb + movq %r9, %rdi + movl %r8d, %edx +L(page_cross_copy_only): + leal CHAR_SIZE(%rdx), %ecx +# ifdef USE_AS_STPCPY +# ifdef USE_AS_WCSCPY + setc %al + addq %rdi, %rdx + leaq (%rdx, %rax, CHAR_SIZE), %rax +# else + movl %edx, %eax + adcq %rdi, %rax +# endif +# else + movq %rdi, %rax +# endif + rep movsb + ret + + +L(best_effort_strncpy): + movq %rdx, %rcx + xorl %eax, %eax + movq %rdi, %r8 + /* The length is >= 2^63. We very much so expect to segfault at + rep stos. If that doesn't happen then just strcpy to finish. + */ +# ifdef USE_AS_WCSCPY + rep stosl +# else + rep stosb +# endif + movq %r8, %rdi + jmp OVERFLOW_STRCPY +END(STRNCPY) +#endif diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h index dca1089060..275af7560a 100644 --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h @@ -27,7 +27,8 @@ #define VEC_SIZE 32 #include "x86-vec-macros.h" -#define USE_WITH_AVX 1 +#define USE_WITH_AVX2 1 + #define SECTION(p) p##.avx /* 4-byte mov instructions with AVX2. */
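
As an aside for readers less familiar with this style of small-size
handling: the L(copy_4_7)/L(copy_8_15)/L(copy_16_31) paths and the
"Overfill to avoid branches" blocks in these files all rely on the same
trick of copying the first and last fixed-width chunk of the range with
two possibly-overlapping accesses, so one path covers every length in
the bucket without a per-length branch.  A rough C sketch of the idea
for the 8-to-16 byte bucket (illustrative only, not code from the
patch; the helper name is made up):

#include <stdint.h>
#include <string.h>

/* Copy n bytes, 8 <= n <= 16, with two 8-byte loads and stores.  The
   second pair overlaps the first whenever n < 16, which is harmless
   and avoids branching on the exact length.  */
void
copy_8_to_16 (char *dst, const char *src, size_t n)
{
  uint64_t head, tail;
  memcpy (&head, src, 8);		/* First 8 bytes.  */
  memcpy (&tail, src + n - 8, 8);	/* Last 8 bytes, may overlap.  */
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);
}

The assembly versions do the same thing with mov/vmovq/VMOVU at the
appropriate widths; the strncat small-size paths then finish with a
MOVCHAR $0 store for the null terminator, and the strncpy paths with
the zero fill.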