Message ID | CAMXFM3vHyJXK=AWcgBSUWHE7HTHSvLER6DVx2joGdTBFEC528Q@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Fri, Dec 11, 2015 at 5:26 AM, Andrew Senkevich <andrew.n.senkevich@gmail.com> wrote: > 2015-12-10 22:34 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >> On Thu, Dec 10, 2015 at 10:28 AM, Andrew Senkevich >> <andrew.n.senkevich@gmail.com> wrote: >>>>> END (MEMSET) >>>>> +libc_hidden_def (__memset_avx2) >>>> >>>> Why is this change needed? If it is needed, please submit >>>> a separate patch. >>> >>> We can avoid this change if hide implementation, test and IFUNC branch >>> under HAVE_AVX512_ASM_SUPPORT. >>> >>>> Should __memset_chk_avx512 also be provided? >>> >>> It will be the same as AVX2 version, is it really needed? >> >> __memset_chk_avx2 calls __memset_avx2. Don't you want >> __memset_chk to call __memset_avx512, instead of __memset_avx2, >> on KNL? > > Oh yes, surely we need it. > > Is patch below Ok for trunk? > > 2015-12-11 Andrew Senkevich <andrew.senkevich@intel.com> > > * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file. > * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests. > * sysdeps/x86_64/multiarch/memset-avx512.S: New file. > * sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch. > * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. 
> > diff --git a/sysdeps/x86_64/multiarch/memset.S > b/sysdeps/x86_64/multiarch/memset.S > index dbc00d2..9f16b7e 100644 > --- a/sysdeps/x86_64/multiarch/memset.S > +++ b/sysdeps/x86_64/multiarch/memset.S > @@ -30,6 +30,13 @@ ENTRY(memset) > HAS_ARCH_FEATURE (AVX2_Usable) > jz 2f > leaq __memset_avx2(%rip), %rax > +#ifdef HAVE_AVX512_ASM_SUPPORT > + HAS_ARCH_FEATURE (AVX512DQ_Usable) > + jnz 2f > + HAS_ARCH_FEATURE (AVX512F_Usable) > + jz 2f > + leaq __memset_avx512(%rip), %rax > +#endif > 2: ret > END(memset) > #endif > diff --git a/sysdeps/x86_64/multiarch/memset_chk.S > b/sysdeps/x86_64/multiarch/memset_chk.S > index e2abb15..5115dfb 100644 > --- a/sysdeps/x86_64/multiarch/memset_chk.S > +++ b/sysdeps/x86_64/multiarch/memset_chk.S > @@ -30,6 +30,13 @@ ENTRY(__memset_chk) > HAS_ARCH_FEATURE (AVX2_Usable) > jz 2f > leaq __memset_chk_avx2(%rip), %rax > +#ifdef HAVE_AVX512_ASM_SUPPORT > + HAS_ARCH_FEATURE (AVX512DQ_Usable) > + jnz 2f > + HAS_ARCH_FEATURE (AVX512F_Usable) > + jz 2f > + leaq __memset_chk_avx512(%rip), %rax > +#endif > 2: ret > END(__memset_chk) > What is the purpose of checking AVX512DQ_Usable? To avoid using it on SKX? Is __memset_avx512 slower than __memset_avx2 on SKX?
2015-12-11 16:39 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: > On Fri, Dec 11, 2015 at 5:26 AM, Andrew Senkevich > <andrew.n.senkevich@gmail.com> wrote: >> 2015-12-10 22:34 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >>> On Thu, Dec 10, 2015 at 10:28 AM, Andrew Senkevich >>> <andrew.n.senkevich@gmail.com> wrote: >>>>>> END (MEMSET) >>>>>> +libc_hidden_def (__memset_avx2) >>>>> >>>>> Why is this change needed? If it is needed, please submit >>>>> a separate patch. >>>> >>>> We can avoid this change if hide implementation, test and IFUNC branch >>>> under HAVE_AVX512_ASM_SUPPORT. >>>> >>>>> Should __memset_chk_avx512 also be provided? >>>> >>>> It will be the same as AVX2 version, is it really needed? >>> >>> __memset_chk_avx2 calls __memset_avx2. Don't you want >>> __memset_chk to call __memset_avx512, instead of __memset_avx2, >>> on KNL? >> >> Oh yes, surely we need it. >> >> Is patch below Ok for trunk? >> >> 2015-12-11 Andrew Senkevich <andrew.senkevich@intel.com> >> >> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file. >> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests. >> * sysdeps/x86_64/multiarch/memset-avx512.S: New file. >> * sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch. >> * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. 
>> > >> diff --git a/sysdeps/x86_64/multiarch/memset.S >> b/sysdeps/x86_64/multiarch/memset.S >> index dbc00d2..9f16b7e 100644 >> --- a/sysdeps/x86_64/multiarch/memset.S >> +++ b/sysdeps/x86_64/multiarch/memset.S >> @@ -30,6 +30,13 @@ ENTRY(memset) >> HAS_ARCH_FEATURE (AVX2_Usable) >> jz 2f >> leaq __memset_avx2(%rip), %rax >> +#ifdef HAVE_AVX512_ASM_SUPPORT >> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >> + jnz 2f >> + HAS_ARCH_FEATURE (AVX512F_Usable) >> + jz 2f >> + leaq __memset_avx512(%rip), %rax >> +#endif >> 2: ret >> END(memset) >> #endif >> diff --git a/sysdeps/x86_64/multiarch/memset_chk.S >> b/sysdeps/x86_64/multiarch/memset_chk.S >> index e2abb15..5115dfb 100644 >> --- a/sysdeps/x86_64/multiarch/memset_chk.S >> +++ b/sysdeps/x86_64/multiarch/memset_chk.S >> @@ -30,6 +30,13 @@ ENTRY(__memset_chk) >> HAS_ARCH_FEATURE (AVX2_Usable) >> jz 2f >> leaq __memset_chk_avx2(%rip), %rax >> +#ifdef HAVE_AVX512_ASM_SUPPORT >> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >> + jnz 2f >> + HAS_ARCH_FEATURE (AVX512F_Usable) >> + jz 2f >> + leaq __memset_chk_avx512(%rip), %rax >> +#endif >> 2: ret >> END(__memset_chk) >> > > What is the purpose of checking AVX512DQ_Usable? To > avoid using it on SKX? Is __memset_avx512 slower than > __memset_avx2 on SKX? This is implementation only for KNL because SKX may require vzeroupper on AVX/SSE transition paths. This version became slower with vzeroupper on that paths, so limited to KNL hardware. -- WBR, Andrew
On Fri, Dec 11, 2015 at 5:45 AM, Andrew Senkevich <andrew.n.senkevich@gmail.com> wrote: > 2015-12-11 16:39 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >> On Fri, Dec 11, 2015 at 5:26 AM, Andrew Senkevich >> <andrew.n.senkevich@gmail.com> wrote: >>> 2015-12-10 22:34 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >>>> On Thu, Dec 10, 2015 at 10:28 AM, Andrew Senkevich >>>> <andrew.n.senkevich@gmail.com> wrote: >>>>>>> END (MEMSET) >>>>>>> +libc_hidden_def (__memset_avx2) >>>>>> >>>>>> Why is this change needed? If it is needed, please submit >>>>>> a separate patch. >>>>> >>>>> We can avoid this change if hide implementation, test and IFUNC branch >>>>> under HAVE_AVX512_ASM_SUPPORT. >>>>> >>>>>> Should __memset_chk_avx512 also be provided? >>>>> >>>>> It will be the same as AVX2 version, is it really needed? >>>> >>>> __memset_chk_avx2 calls __memset_avx2. Don't you want >>>> __memset_chk to call __memset_avx512, instead of __memset_avx2, >>>> on KNL? >>> >>> Oh yes, surely we need it. >>> >>> Is patch below Ok for trunk? >>> >>> 2015-12-11 Andrew Senkevich <andrew.senkevich@intel.com> >>> >>> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file. >>> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests. >>> * sysdeps/x86_64/multiarch/memset-avx512.S: New file. >>> * sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch. >>> * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. 
>>> >> >>> diff --git a/sysdeps/x86_64/multiarch/memset.S >>> b/sysdeps/x86_64/multiarch/memset.S >>> index dbc00d2..9f16b7e 100644 >>> --- a/sysdeps/x86_64/multiarch/memset.S >>> +++ b/sysdeps/x86_64/multiarch/memset.S >>> @@ -30,6 +30,13 @@ ENTRY(memset) >>> HAS_ARCH_FEATURE (AVX2_Usable) >>> jz 2f >>> leaq __memset_avx2(%rip), %rax >>> +#ifdef HAVE_AVX512_ASM_SUPPORT >>> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >>> + jnz 2f >>> + HAS_ARCH_FEATURE (AVX512F_Usable) >>> + jz 2f >>> + leaq __memset_avx512(%rip), %rax >>> +#endif >>> 2: ret >>> END(memset) >>> #endif >>> diff --git a/sysdeps/x86_64/multiarch/memset_chk.S >>> b/sysdeps/x86_64/multiarch/memset_chk.S >>> index e2abb15..5115dfb 100644 >>> --- a/sysdeps/x86_64/multiarch/memset_chk.S >>> +++ b/sysdeps/x86_64/multiarch/memset_chk.S >>> @@ -30,6 +30,13 @@ ENTRY(__memset_chk) >>> HAS_ARCH_FEATURE (AVX2_Usable) >>> jz 2f >>> leaq __memset_chk_avx2(%rip), %rax >>> +#ifdef HAVE_AVX512_ASM_SUPPORT >>> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >>> + jnz 2f >>> + HAS_ARCH_FEATURE (AVX512F_Usable) >>> + jz 2f >>> + leaq __memset_chk_avx512(%rip), %rax >>> +#endif >>> 2: ret >>> END(__memset_chk) >>> >> >> What is the purpose of checking AVX512DQ_Usable? To >> avoid using it on SKX? Is __memset_avx512 slower than >> __memset_avx2 on SKX? > > This is implementation only for KNL because SKX may require vzeroupper > on AVX/SEE transition paths. > This version became slower with vzeroupper on that paths, so limited > to KNL hardware. > Please make following changes: 1. Change _avx512 to _avx512_no_vzeroupper. 2. Add a feature, Prefer_No_VZEROUPPER, to cpu-features.h, and set it for KNL. 3. Check Prefer_No_VZEROUPPER instead of AVX512DQ_Usable, 4. Don't check AVX512DQ_Usable nor Prefer_No_VZEROUPPER in ifunc-impl-list.c.
On Fri, Dec 11, 2015 at 5:53 AM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Fri, Dec 11, 2015 at 5:45 AM, Andrew Senkevich > <andrew.n.senkevich@gmail.com> wrote: >> 2015-12-11 16:39 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >>> On Fri, Dec 11, 2015 at 5:26 AM, Andrew Senkevich >>> <andrew.n.senkevich@gmail.com> wrote: >>>> 2015-12-10 22:34 GMT+03:00 H.J. Lu <hjl.tools@gmail.com>: >>>>> On Thu, Dec 10, 2015 at 10:28 AM, Andrew Senkevich >>>>> <andrew.n.senkevich@gmail.com> wrote: >>>>>>>> END (MEMSET) >>>>>>>> +libc_hidden_def (__memset_avx2) >>>>>>> >>>>>>> Why is this change needed? If it is needed, please submit >>>>>>> a separate patch. >>>>>> >>>>>> We can avoid this change if hide implementation, test and IFUNC branch >>>>>> under HAVE_AVX512_ASM_SUPPORT. >>>>>> >>>>>>> Should __memset_chk_avx512 also be provided? >>>>>> >>>>>> It will be the same as AVX2 version, is it really needed? >>>>> >>>>> __memset_chk_avx2 calls __memset_avx2. Don't you want >>>>> __memset_chk to call __memset_avx512, instead of __memset_avx2, >>>>> on KNL? >>>> >>>> Oh yes, surely we need it. >>>> >>>> Is patch below Ok for trunk? >>>> >>>> 2015-12-11 Andrew Senkevich <andrew.senkevich@intel.com> >>>> >>>> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Added new file. >>>> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests. >>>> * sysdeps/x86_64/multiarch/memset-avx512.S: New file. >>>> * sysdeps/x86_64/multiarch/memset.S: Added new IFUNC branch. >>>> * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. 
>>>> >>> >>>> diff --git a/sysdeps/x86_64/multiarch/memset.S >>>> b/sysdeps/x86_64/multiarch/memset.S >>>> index dbc00d2..9f16b7e 100644 >>>> --- a/sysdeps/x86_64/multiarch/memset.S >>>> +++ b/sysdeps/x86_64/multiarch/memset.S >>>> @@ -30,6 +30,13 @@ ENTRY(memset) >>>> HAS_ARCH_FEATURE (AVX2_Usable) >>>> jz 2f >>>> leaq __memset_avx2(%rip), %rax >>>> +#ifdef HAVE_AVX512_ASM_SUPPORT >>>> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >>>> + jnz 2f >>>> + HAS_ARCH_FEATURE (AVX512F_Usable) >>>> + jz 2f >>>> + leaq __memset_avx512(%rip), %rax >>>> +#endif >>>> 2: ret >>>> END(memset) >>>> #endif >>>> diff --git a/sysdeps/x86_64/multiarch/memset_chk.S >>>> b/sysdeps/x86_64/multiarch/memset_chk.S >>>> index e2abb15..5115dfb 100644 >>>> --- a/sysdeps/x86_64/multiarch/memset_chk.S >>>> +++ b/sysdeps/x86_64/multiarch/memset_chk.S >>>> @@ -30,6 +30,13 @@ ENTRY(__memset_chk) >>>> HAS_ARCH_FEATURE (AVX2_Usable) >>>> jz 2f >>>> leaq __memset_chk_avx2(%rip), %rax >>>> +#ifdef HAVE_AVX512_ASM_SUPPORT >>>> + HAS_ARCH_FEATURE (AVX512DQ_Usable) >>>> + jnz 2f >>>> + HAS_ARCH_FEATURE (AVX512F_Usable) >>>> + jz 2f >>>> + leaq __memset_chk_avx512(%rip), %rax >>>> +#endif >>>> 2: ret >>>> END(__memset_chk) >>>> >>> >>> What is the purpose of checking AVX512DQ_Usable? To >>> avoid using it on SKX? Is __memset_avx512 slower than >>> __memset_avx2 on SKX? >> >> This is implementation only for KNL because SKX may require vzeroupper >> on AVX/SEE transition paths. >> This version became slower with vzeroupper on that paths, so limited >> to KNL hardware. >> > > Please make following changes: > > 1. Change _avx512 to _avx512_no_vzeroupper. > 2. Add a feature, Prefer_No_VZEROUPPER, to cpu-features.h, and set > it for KNL. > 3. Check Prefer_No_VZEROUPPER instead of AVX512DQ_Usable, > 4. Don't check AVX512DQ_Usable nor Prefer_No_VZEROUPPER in > ifunc-impl-list.c. 
> I submitted a patch to enable SLM optimization for KNL: https://sourceware.org/ml/libc-alpha/2015-12/msg00221.html It is on hjl/32bit/master branch. Please rebase your patch against mine since it adds KNL optimization.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index bb811c2..5bb859e 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -18,7 +18,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ - strcspn-c strpbrk-c strspn-c varshift memset-avx2 + strcspn-c strpbrk-c strspn-c varshift memset-avx2 memset-avx512 CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 5c0c219..ef3e64a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -20,6 +20,7 @@ #include <string.h> #include <wchar.h> #include <ifunc-impl-list.h> +#include <sysdep.h> #include "init-arch.h" /* Maximum number of IFUNC implementations. */ @@ -76,14 +77,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memset_chk_sse2) IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_chk_avx2)) + __memset_chk_avx2) +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable) + && !HAS_ARCH_FEATURE (AVX512DQ_Usable), + __memset_chk_avx512) +#endif + ) /* Support sysdeps/x86_64/multiarch/memset.S. */ IFUNC_IMPL (i, name, memset, IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2) IFUNC_IMPL_ADD (array, i, memset, HAS_ARCH_FEATURE (AVX2_Usable), - __memset_avx2)) + __memset_avx2) +#ifdef HAVE_AVX512_ASM_SUPPORT + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable) + && !HAS_ARCH_FEATURE (AVX512DQ_Usable), + __memset_avx512) +#endif + ) /* Support sysdeps/x86_64/multiarch/stpncpy.S. 
*/ IFUNC_IMPL (i, name, stpncpy, diff --git a/sysdeps/x86_64/multiarch/memset-avx512.S b/sysdeps/x86_64/multiarch/memset-avx512.S new file mode 100644 index 0000000..490fd72 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx512.S @@ -0,0 +1,194 @@ +/* memset optimized with AVX512 for KNL hardware. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) + +#include "asm-syntax.h" +#ifndef MEMSET +# define MEMSET __memset_avx512 +# define MEMSET_CHK __memset_chk_avx512 +#endif + + .section .text,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi + mov %rdi, %rax + vpshufb %xmm0, %xmm1, %xmm0 + cmp $16, %rdx + jb L(less_16bytes) + cmp $512, %rdx + vbroadcastss %xmm0, %zmm2 + ja L(512bytesormore) + cmp $256, %rdx + jb L(less_256bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm2, -0x20(%rsi) + ret + +L(less_32bytes): + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm0, -0x10(%rsi) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + vmovq %xmm0, (%rdi) + vmovq %xmm0, -0x08(%rsi) + ret + +L(less_8bytes): + vmovd %xmm0, %ecx + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%rsi) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): + mov __x86_shared_cache_size_half(%rip), %rcx + cmp %rcx, %rdx + ja L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups 
%zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, 0x100(%rdi) + vmovups %zmm2, 0x140(%rdi) + vmovups %zmm2, 0x180(%rdi) + vmovups %zmm2, 0x1C0(%rdi) + vmovups %zmm2, -0x200(%rsi) + vmovups %zmm2, -0x1C0(%rsi) + vmovups %zmm2, -0x180(%rsi) + vmovups %zmm2, -0x140(%rsi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +/* Align on 64 and loop with aligned stores. */ +L(1024bytesormore): + sub $0x100, %rsi + vmovups %zmm2, (%rax) + and $-0x40, %rdi + add $0x40, %rdi + +L(gobble_256bytes_loop): + vmovaps %zmm2, (%rdi) + vmovaps %zmm2, 0x40(%rdi) + vmovaps %zmm2, 0x80(%rdi) + vmovaps %zmm2, 0xC0(%rdi) + add $0x100, %rdi + cmp %rsi, %rdi + jb L(gobble_256bytes_loop) + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + ret + +/* Align on 128 and loop with non-temporal stores. */ +L(preloop_large): + and $-0x80, %rdi + add $0x80, %rdi + vmovups %zmm2, (%rax) + vmovups %zmm2, 0x40(%rax) + sub $0x200, %rsi + +L(gobble_512bytes_nt_loop): + vmovntdq %zmm2, (%rdi) + vmovntdq %zmm2, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm2, 0xC0(%rdi) + vmovntdq %zmm2, 0x100(%rdi) + vmovntdq %zmm2, 0x140(%rdi) + vmovntdq %zmm2, 0x180(%rdi) + vmovntdq %zmm2, 0x1C0(%rdi) + add $0x200, %rdi + cmp %rsi, %rdi + jb L(gobble_512bytes_nt_loop) + sfence + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + vmovups %zmm2, 0x100(%rsi) + vmovups %zmm2, 0x140(%rsi) + vmovups %zmm2, 0x180(%rsi) + vmovups %zmm2, 0x1C0(%rsi) + ret +END (MEMSET) +#endif diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S index dbc00d2..9f16b7e 100644 --- a/sysdeps/x86_64/multiarch/memset.S +++ b/sysdeps/x86_64/multiarch/memset.S @@ -30,6 +30,13 @@ ENTRY(memset) HAS_ARCH_FEATURE (AVX2_Usable) jz 2f leaq __memset_avx2(%rip), %rax +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE 
(AVX512DQ_Usable) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + leaq __memset_avx512(%rip), %rax +#endif 2: ret END(memset) #endif diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S index e2abb15..5115dfb 100644 --- a/sysdeps/x86_64/multiarch/memset_chk.S +++ b/sysdeps/x86_64/multiarch/memset_chk.S @@ -30,6 +30,13 @@ ENTRY(__memset_chk) HAS_ARCH_FEATURE (AVX2_Usable) jz 2f leaq __memset_chk_avx2(%rip), %rax +#ifdef HAVE_AVX512_ASM_SUPPORT + HAS_ARCH_FEATURE (AVX512DQ_Usable) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + leaq __memset_chk_avx512(%rip), %rax +#endif 2: ret END(__memset_chk)