Message ID: 1396850238-29041-1-git-send-email-ling.ma@alipay.com
State: New
Any comments about memcpy/memset/avx detection patches? Thanks Ling 2014-04-07 13:57 GMT+08:00, ling.ma.program@gmail.com <ling.ma.program@gmail.com>: > From: Ling Ma <ling.ml@alibaba-inc.com> > > In this patch we take advantage of HSW memory bandwidth, manage to > reduce miss branch prediction by avoid using branch instructions and > force destination to be aligned with avx instruction. > > The CPU2006 403.gcc benchmark also indicate this patch improves performance > from 22.9% to 59% compared with original memset implemented by sse2. > > memset-AVX memset-SSE2 AVX vs SSE2 > gcc.166.i 1877958334 2495113045 1.328630673 > gcc.200.i 3507448572 4869401205 1.388302952 > gcc.cp-decl.i 1742510758 2282801367 1.310064432 > gcc.c-typeck.i 9546331594 12158804366 1.273662479 > gcc.expr2.i 5067111165 6470777800 1.277015165 > gcc.expr.i 3434703577 4420252661 1.286938614 > gcc.g23.i 5141096267 6318410858 1.22900069 > gcc.s04.i 8652255048 10923077090 1.262454358 > gcc.scilab.i 1209694573 1925173588 1.591454265 > > --- > We fixed code and re-test all cases, including sse2 and avx2. > > ChangeLog | 9 ++ > sysdeps/x86_64/multiarch/Makefile | 4 +- > sysdeps/x86_64/multiarch/memset-avx2.S | 192 > +++++++++++++++++++++++++++++++++ > sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++ > sysdeps/x86_64/multiarch/memset_chk.S | 44 ++++++++ > 5 files changed, 307 insertions(+), 1 deletion(-) > create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/memset.S > create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S > > diff --git a/ChangeLog b/ChangeLog > index ab23a3a..851fe9e 100644 > --- a/ChangeLog > +++ b/ChangeLog > @@ -1,3 +1,12 @@ > +2014-04-04 Ling Ma <ling.ml@alibaba-inc.com> > + > + * sysdeps/x86_64/multiarch/Makefile: Add memset-avx2 > + * sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset > + * sysdeps/x86_64/multiarch/memset.S: New file for multiple memset > + versions > + * sysdeps/x86_64/multiarch/memset_chk.S: New file for multiple memset_chk > + versions > + > 2014-04-04 Sihai Yao <sihai.ysh@alibaba-inc.com> > > * sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPU_INDEX_7 and > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > index 57a3c13..42df96f 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -17,7 +17,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c > strcmp-ssse3 \ > strcpy-sse2-unaligned strncpy-sse2-unaligned \ > stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ > strcat-sse2-unaligned strncat-sse2-unaligned \ > - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned > + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ > + memset-avx2 > + > ifeq (yes,$(config-cflags-sse4)) > sysdep_routines += strcspn-c strpbrk-c strspn-c varshift > CFLAGS-varshift.c += -msse4 > diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S > b/sysdeps/x86_64/multiarch/memset-avx2.S > new file mode 100644 > index 0000000..5d4a487 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memset-avx2.S > @@ -0,0 +1,192 @@ > +/* memset with AVX2 > + Copyright (C) 2014 Free Software Foundation, Inc. > + Contributed by Alibaba Group. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +#if !defined NOT_IN_libc > + > +#include "asm-syntax.h" > +#ifndef ALIGN > +# define ALIGN(n) .p2align n > +#endif > +#ifndef MEMSET > +# define MEMSET __memset_avx2 > +# define MEMSET_CHK __memset_chk_avx2 > +#endif > + > + .section .text.avx2,"ax",@progbits > +#if defined PIC > +ENTRY (MEMSET_CHK) > + cmpq %rdx, %rcx > + jb HIDDEN_JUMPTARGET (__chk_fail) > +END (MEMSET_CHK) > +#endif > + > +ENTRY (MEMSET) > + vpxor %xmm0, %xmm0, %xmm0 > + vmovd %esi, %xmm1 > + lea (%rdi, %rdx), %r8 > + vpshufb %xmm0, %xmm1, %xmm0 > + mov %rdi, %rax > + cmp $256, %rdx > + jae L(256bytesormore) > + vmovd %xmm0, %rcx > + cmp $128, %rdx > + jb L(less_128bytes) > + vmovups %xmm0, (%rdi) > + vmovups %xmm0, 0x10(%rdi) > + vmovups %xmm0, 0x20(%rdi) > + vmovups %xmm0, 0x30(%rdi) > + vmovups %xmm0, 0x40(%rdi) > + vmovups %xmm0, 0x50(%rdi) > + vmovups %xmm0, 0x60(%rdi) > + vmovups %xmm0, 0x70(%rdi) > + vmovups %xmm0, -0x80(%r8) > + vmovups %xmm0, -0x70(%r8) > + vmovups %xmm0, -0x60(%r8) > + vmovups %xmm0, -0x50(%r8) > + vmovups %xmm0, -0x40(%r8) > + vmovups %xmm0, -0x30(%r8) > + vmovups %xmm0, -0x20(%r8) > + vmovups %xmm0, -0x10(%r8) > + ret > + ALIGN(4) > +L(less_128bytes): > + cmp $64, %edx > + jb L(less_64bytes) > + vmovups %xmm0, (%rdi) > + vmovups %xmm0, 0x10(%rdi) > + vmovups %xmm0, 0x20(%rdi) > + vmovups %xmm0, 0x30(%rdi) > + vmovups %xmm0, -0x40(%r8) > + vmovups %xmm0, -0x30(%r8) > + vmovups %xmm0, -0x20(%r8) > + vmovups %xmm0, -0x10(%r8) > + ret > + ALIGN(4) > +L(less_64bytes): > + cmp $32, %edx > + jb L(less_32bytes) > + vmovups %xmm0, (%rdi) > + vmovups %xmm0, 0x10(%rdi) > + vmovups %xmm0, -0x20(%r8) > + vmovups %xmm0, -0x10(%r8) > + ret > + ALIGN(4) > +L(less_32bytes): > + cmp $16, %edx > + jb L(less_16bytes) > + vmovups %xmm0, (%rdi) > + vmovups %xmm0, -0x10(%r8) > + ret > + ALIGN(4) > +L(less_16bytes): > + cmp $8, %edx > + jb L(less_8bytes) > + mov %rcx, (%rdi) > + mov %rcx, -0x08(%r8) > + ret > + ALIGN(4) > +L(less_8bytes): > + cmp $4, %edx > + jb L(less_4bytes) > + mov %ecx, (%rdi) > + mov %ecx, -0x04(%r8) > + ALIGN(4) > +L(less_4bytes): > + cmp $2, %edx > + jb L(less_2bytes) > + mov %cx, (%rdi) > + mov %cx, -0x02(%r8) > + ret > + ALIGN(4) > +L(less_2bytes): > + cmp $1, %edx > + jb L(less_1bytes) > + mov %cl, (%rdi) > +L(less_1bytes): > + ret > + > + ALIGN(4) > +L(256bytesormore): > + vinserti128 $1, %xmm0, %ymm0, %ymm0 > + vmovups %ymm0, (%rdi) > + mov %rdi, %r9 > + and $-0x20, %rdi > + add $32, %rdi > + sub %rdi, %r9 > + add %r9, %rdx > + cmp $4096, %rdx > + ja L(gobble_data) > + > + sub $0x80, %rdx > +L(gobble_128_loop): > + vmovaps %ymm0, (%rdi) > + vmovaps %ymm0, 0x20(%rdi) > + vmovaps %ymm0, 0x40(%rdi) > + vmovaps %ymm0, 0x60(%rdi) > + lea 0x80(%rdi), %rdi > + sub $0x80, %rdx > + jae L(gobble_128_loop) > + vmovups %ymm0, -0x80(%r8) > + vmovups %ymm0, -0x60(%r8) > + vmovups %ymm0, -0x40(%r8) > + vmovups %ymm0, -0x20(%r8) > + vzeroupper > + ret > + > + ALIGN(4) > +L(gobble_data): > +#ifdef SHARED_CACHE_SIZE_HALF > + mov $SHARED_CACHE_SIZE_HALF, %r9 > +#else > + mov __x86_shared_cache_size_half(%rip), %r9 > +#endif > + shl $4, %r9 > + cmp 
%r9, %rdx > + ja L(gobble_big_data) > + mov %rax, %r9 > + mov %esi, %eax > + mov %rdx, %rcx > + rep stosb > + mov %r9, %rax > + vzeroupper > + ret > + > + ALIGN(4) > +L(gobble_big_data): > + sub $0x80, %rdx > +L(gobble_big_data_loop): > + vmovntdq %ymm0, (%rdi) > + vmovntdq %ymm0, 0x20(%rdi) > + vmovntdq %ymm0, 0x40(%rdi) > + vmovntdq %ymm0, 0x60(%rdi) > + lea 0x80(%rdi), %rdi > + sub $0x80, %rdx > + jae L(gobble_big_data_loop) > + vmovups %ymm0, -0x80(%r8) > + vmovups %ymm0, -0x60(%r8) > + vmovups %ymm0, -0x40(%r8) > + vmovups %ymm0, -0x20(%r8) > + vzeroupper > + sfence > + ret > + > +END (MEMSET) > +#endif > diff --git a/sysdeps/x86_64/multiarch/memset.S > b/sysdeps/x86_64/multiarch/memset.S > new file mode 100644 > index 0000000..df903af > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memset.S > @@ -0,0 +1,59 @@ > +/* Multiple versions of memset > + Copyright (C) 2014 Free Software Foundation, Inc. > + Contributed by Alibaba Group. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include <shlib-compat.h> > +#include <init-arch.h> > + > +/* Define multiple versions only for the definition in lib. */ > +#ifndef NOT_IN_libc > +ENTRY(memset) > + .type memset, @gnu_indirect_function > + cmpl $0, __cpu_features+KIND_OFFSET(%rip) > + jne 1f > + call __init_cpu_features > +1: leaq __memset_sse2(%rip), %rax > + testl $bit_AVX2_Usable, > __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) > + jz 2f > + leaq __memset_avx2(%rip), %rax > +2: ret > +END(memset) > +#endif > + > +#if !defined NOT_IN_libc > +# undef memset > +# define memset __memset_sse2 > + > +# undef __memset_chk > +# define __memset_chk __memset_chk_sse2 > + > +# ifdef SHARED > +# undef libc_hidden_builtin_def > +/* It doesn't make sense to send libc-internal memset calls through a PLT. > + The speedup we get from using GPR instruction is likely eaten away > + by the indirect call in the PLT. */ > +# define libc_hidden_builtin_def(name) \ > + .globl __GI_memset; __GI_memset = __memset_sse2 > +# endif > + > +# undef strong_alias > +# define strong_alias(original, alias) > +#endif > + > +#include "../memset.S" > diff --git a/sysdeps/x86_64/multiarch/memset_chk.S > b/sysdeps/x86_64/multiarch/memset_chk.S > new file mode 100644 > index 0000000..f048dac > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memset_chk.S > @@ -0,0 +1,44 @@ > +/* Multiple versions of memset_chk > + Copyright (C) 2014 Free Software Foundation, Inc. > + Contributed by Alibaba Group. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > +#include <init-arch.h> > + > +/* Define multiple versions only for the definition in lib. */ > +#ifndef NOT_IN_libc > +# ifdef SHARED > +ENTRY(__memset_chk) > + .type __memset_chk, @gnu_indirect_function > + cmpl $0, __cpu_features+KIND_OFFSET(%rip) > + jne 1f > + call __init_cpu_features > +1: leaq __memset_chk_sse2(%rip), %rax > + testl $bit_AVX2_Usable, > __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) > + jz 2f > + leaq __memset_chk_avx2(%rip), %rax > +2: ret > +END(__memset_chk) > + > +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) > + .section .gnu.warning.__memset_zero_constant_len_parameter > + .string "memset used with constant zero length parameter; this could be > due to transposed parameters" > +# else > +# include "../memset_chk.S" > +# endif > +#endif > -- > 1.8.1.4 > >
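For readers following the quoted assembly, the key trick in the sub-256-byte paths is that the fill value is written from both ends of the buffer with overlapping vector stores, so a single size check replaces a tail loop. Below is a minimal C sketch of that idea using 16-byte SSE2 intrinsics; the helper name and structure are hypothetical, for illustration only, and this is not the patch's code.

#include <emmintrin.h>
#include <stdio.h>
#include <stddef.h>

/* Hypothetical helper, not the patch's code: fill 16..255 bytes the way
   the quoted assembly does.  One comparison chain picks how many 16-byte
   stores are needed; they are then issued from the start of the buffer
   and, overlapping, back from the end, so no tail loop is required.  */
static void
memset_16_to_255 (void *dst, int c, size_t n)
{
  __m128i v = _mm_set1_epi8 ((char) c);
  char *d = dst;
  char *e = d + n;       /* one past the end, like %r8 in the assembly */
  size_t k = n >= 128 ? 8 : n >= 64 ? 4 : n >= 32 ? 2 : 1;
  size_t i;

  for (i = 0; i < k; i++)
    {
      _mm_storeu_si128 ((__m128i *) (d + 16 * i), v);        /* forward  */
      _mm_storeu_si128 ((__m128i *) (e - 16 * (i + 1)), v);  /* backward */
    }
}

int
main (void)
{
  unsigned char buf[200];
  memset_16_to_255 (buf, 0xab, sizeof buf);
  printf ("%x %x\n", buf[0], buf[199]);   /* both ab */
  return 0;
}

The assembly unrolls those stores by hand instead of looping, which is where the claimed branch-prediction win over a conventional store loop comes from.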
On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote: > From: Ling Ma <ling.ml@alibaba-inc.com> > > In this patch we take advantage of HSW memory bandwidth, manage to > reduce miss branch prediction by avoid using branch instructions and > force destination to be aligned with avx instruction. > > The CPU2006 403.gcc benchmark also indicate this patch improves performance > from 22.9% to 59% compared with original memset implemented by sse2. > Looks mostly ok except mostly mechanic changes. > diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S > new file mode 100644 > index 0000000..5d4a487 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/memset-avx2.S > @@ -0,0 +1,192 @@ > +/* memset with AVX2 > + Copyright (C) 2014 Free Software Foundation, Inc. > + Contributed by Alibaba Group. We no longer add Contributed by. > +#include "asm-syntax.h" > +#ifndef ALIGN > +# define ALIGN(n) .p2align n > +#endif Also in meantime we decided to remove ALIGN macro so remove that and s/ALIGN(3)/.p2align 4/ s/ALIGN(4)/.p2align 4/ > + > +ENTRY (MEMSET) > + vpxor %xmm0, %xmm0, %xmm0 > + vmovd %esi, %xmm1 > + lea (%rdi, %rdx), %r8 < snip > > + vmovups %xmm0, 0x70(%rdi) > + vmovups %xmm0, -0x80(%r8) I would globally replace %r8 by %rsi, this makes instruction byte shorter, %r9 is similar. > +L(less_4bytes): > + cmp $2, %edx > + jb L(less_2bytes) > + mov %cx, (%rdi) > + mov %cx, -0x02(%r8) > + ret > + ALIGN(4) > +L(less_2bytes): > + cmp $1, %edx > + jb L(less_1bytes) > + mov %cl, (%rdi) > +L(less_1bytes): > + ret > + Here current implementation saves one comparison by L(less_4bytes): cmp $1, %edx jbe L(less_2bytes) mov %cx, (%rdi) mov %cx, -0x02(%r8) ret ALIGN(4) L(less_2bytes): jb L(less_1bytes) mov %cl, (%rdi) L(less_1bytes): ret > + ALIGN(4) > +L(256bytesormore): > + vinserti128 $1, %xmm0, %ymm0, %ymm0 > + vmovups %ymm0, (%rdi) > + mov %rdi, %r9 > + and $-0x20, %rdi > + add $32, %rdi > + sub %rdi, %r9 > + add %r9, %rdx > + cmp $4096, %rdx > + ja L(gobble_data) > + > + sub $0x80, %rdx > +L(gobble_128_loop): > + vmovaps %ymm0, (%rdi) > + vmovaps %ymm0, 0x20(%rdi) > + vmovaps %ymm0, 0x40(%rdi) > + vmovaps %ymm0, 0x60(%rdi) > + lea 0x80(%rdi), %rdi > + sub $0x80, %rdx > + jae L(gobble_128_loop) > + vmovups %ymm0, -0x80(%r8) > + vmovups %ymm0, -0x60(%r8) > + vmovups %ymm0, -0x40(%r8) > + vmovups %ymm0, -0x20(%r8) > + vzeroupper > + ret > + I looked into this by objdump and loop is misaligned by 5 bytes which could be problem if haswell could not handle that. If you align loop as below does that it improve performance? .p2align 4 ret; ret; ret; ret; ret L(256bytesormore): also in this pattern > + > + sub $0x80, %rdx A gcc saves three bytes by using add $-0x80, %rdx Third possible optimization is move vmovups before loop which improves latency but it needs to be tested on haswell. > + ALIGN(4) > +L(gobble_data): > +#ifdef SHARED_CACHE_SIZE_HALF > + mov $SHARED_CACHE_SIZE_HALF, %r9 > +#else > + mov __x86_shared_cache_size_half(%rip), %r9 > +#endif typo here, __x86_64_shared_cache_size_half > + shl $4, %r9 > + cmp %r9, %rdx > + ja L(gobble_big_data) > + mov %rax, %r9 > + mov %esi, %eax > + mov %rdx, %rcx > + rep stosb How does this compares with stosq equivalent? > + mov %r9, %rax > + vzeroupper > + ret > +
Hi Ondra, Thanks for your comments, and changed as below, the new version have been sent to you. Regards Ling > >> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S >> b/sysdeps/x86_64/multiarch/memset-avx2.S >> new file mode 100644 >> index 0000000..5d4a487 >> --- /dev/null >> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S >> @@ -0,0 +1,192 @@ >> +/* memset with AVX2 >> + Copyright (C) 2014 Free Software Foundation, Inc. >> + Contributed by Alibaba Group. > > We no longer add Contributed by. Removed in new version. > >> +#include "asm-syntax.h" >> +#ifndef ALIGN >> +# define ALIGN(n) .p2align n >> +#endif > > Also in meantime we decided to remove ALIGN macro so remove that and > s/ALIGN(3)/.p2align 4/ s/ALIGN(4)/.p2align 4/ Fixed in new version > >> + >> +ENTRY (MEMSET) >> + vpxor %xmm0, %xmm0, %xmm0 >> + vmovd %esi, %xmm1 >> + lea (%rdi, %rdx), %r8 > > < snip > > >> + vmovups %xmm0, 0x70(%rdi) >> + vmovups %xmm0, -0x80(%r8) > > I would globally replace %r8 by %rsi, this makes instruction byte shorter, > %r9 is similar. Fixed in new version, and we make branch instruction in different 16byte to improve branch prediction accurate. > >> +L(less_4bytes): >> + cmp $2, %edx >> + jb L(less_2bytes) >> + mov %cx, (%rdi) >> + mov %cx, -0x02(%r8) >> + ret >> + ALIGN(4) >> +L(less_2bytes): >> + cmp $1, %edx >> + jb L(less_1bytes) >> + mov %cl, (%rdi) >> +L(less_1bytes): >> + ret >> + > > Here current implementation saves one comparison by > Did in new version. > L(less_4bytes): > cmp $1, %edx > jbe L(less_2bytes) > mov %cx, (%rdi) > mov %cx, -0x02(%r8) > ret > ALIGN(4) > L(less_2bytes): > jb L(less_1bytes) > mov %cl, (%rdi) > L(less_1bytes): > ret > >> + ALIGN(4) >> +L(256bytesormore): >> + vinserti128 $1, %xmm0, %ymm0, %ymm0 >> + vmovups %ymm0, (%rdi) >> + mov %rdi, %r9 >> + and $-0x20, %rdi >> + add $32, %rdi >> + sub %rdi, %r9 >> + add %r9, %rdx >> + cmp $4096, %rdx >> + ja L(gobble_data) >> + >> + sub $0x80, %rdx >> +L(gobble_128_loop): >> + vmovaps %ymm0, (%rdi) >> + vmovaps %ymm0, 0x20(%rdi) >> + vmovaps %ymm0, 0x40(%rdi) >> + vmovaps %ymm0, 0x60(%rdi) >> + lea 0x80(%rdi), %rdi >> + sub $0x80, %rdx >> + jae L(gobble_128_loop) >> + vmovups %ymm0, -0x80(%r8) >> + vmovups %ymm0, -0x60(%r8) >> + vmovups %ymm0, -0x40(%r8) >> + vmovups %ymm0, -0x20(%r8) >> + vzeroupper >> + ret >> + > > I looked into this by objdump and loop is misaligned by 5 bytes which > could be problem if haswell could not handle that. > If you align loop as below does that it improve performance? > Fixed the issue ,loop is aligned. > .p2align 4 > ret; ret; ret; ret; ret > L(256bytesormore): > > > also in this pattern > >> + >> + sub $0x80, %rdx > > A gcc saves three bytes by using > add $-0x80, %rdx > Changed with similar method > Third possible optimization is move vmovups before loop which improves > latency but it needs to be tested on haswell. > Tested below mode, but it hurt performance, original code could get benefit from hardware prefetch because of sequence access vmovdqu %ymm0, -0x80(%rsi) vmovdqu %ymm0, -0x60(%rsi) vmovdqu %ymm0, -0x40(%rsi) vmovdqu %ymm0, -0x20(%rsi) sub %ecx, %edx L(gobble_128_loop): vmovdqa %ymm0, (%rdi) vmovdqa %ymm0, 0x20(%rdi) vmovdqa %ymm0, 0x40(%rdi) vmovdqa %ymm0, 0x60(%rdi) add %rcx, %rdi sub %ecx, %edx jae L(gobble_128_loop) .... 
> >> + ALIGN(4) >> +L(gobble_data): >> +#ifdef SHARED_CACHE_SIZE_HALF >> + mov $SHARED_CACHE_SIZE_HALF, %r9 >> +#else >> + mov __x86_shared_cache_size_half(%rip), %r9 >> +#endif > > typo here, __x86_64_shared_cache_size_half __x86_64_shared_cache_size_half will cause crash, so keep __x86_shared_cache_size_half. > >> + shl $4, %r9 >> + cmp %r9, %rdx >> + ja L(gobble_big_data) >> + mov %rax, %r9 >> + mov %esi, %eax >> + mov %rdx, %rcx >> + rep stosb > > How does this compares with stosq equivalent? yes, tested but no improvement. > >> + mov %r9, %rax >> + vzeroupper >> + ret >> + > >
On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote: > From: Ling Ma <ling.ml@alibaba-inc.com> > > In this patch we take advantage of HSW memory bandwidth, manage to > reduce miss branch prediction by avoid using branch instructions and > force destination to be aligned with avx instruction. > Now when we have a haswell machine on our department I tested this implementation. Benchmark used and results are here. http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx130514.tar.bz2 http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx.html This patch improves large inputs and does not regress small inputs much which gives a total 10% improvement on gcc test, it could be improved but it now looks good enough. I tried two alternatives. First is using avx2 in header(memset_fuse). It look it helps, it adds additional 0.5% of performance. However I tried to crosscheck this with bash shell where comparison is in opposite direction so I not entirely sure yet, see http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/memset_profile_avx/results_bash/result.html Second is checking if rep treshold is best one, this depends on application cache layout I do not have definite answer yet (memset_rep and memset_avx_v2 variants), when data is in L2 cache we could lower treshold to 1024 bytes but it slows real inputs for some reason. > The CPU2006 403.gcc benchmark also indicate this patch improves performance > from 22.9% to 59% compared with original memset implemented by sse2. > I inspected that benchmark with my profiler is not that good as its only simple part of gcc and two third of total time is spend on 240 long inputs. A large part of speedup could be explained that avx2 implementation has a special case branch for 128-256 byte range but current one uses loop. These distributions are different from other program and running gcc itself as short inputs are more common there. > + ALIGN(4) > +L(gobble_data): > +#ifdef SHARED_CACHE_SIZE_HALF > + mov $SHARED_CACHE_SIZE_HALF, %r9 > +#else > + mov __x86_shared_cache_size_half(%rip), %r9 > +#endif > + shl $4, %r9 > + cmp %r9, %rdx > + ja L(gobble_big_data) > + mov %rax, %r9 > + mov %esi, %eax > + mov %rdx, %rcx > + rep stosb > + mov %r9, %rax > + vzeroupper > + ret > + > + ALIGN(4) > +L(gobble_big_data): > + sub $0x80, %rdx > +L(gobble_big_data_loop): > + vmovntdq %ymm0, (%rdi) > + vmovntdq %ymm0, 0x20(%rdi) > + vmovntdq %ymm0, 0x40(%rdi) > + vmovntdq %ymm0, 0x60(%rdi) > + lea 0x80(%rdi), %rdi > + sub $0x80, %rdx > + jae L(gobble_big_data_loop) > + vmovups %ymm0, -0x80(%r8) > + vmovups %ymm0, -0x60(%r8) > + vmovups %ymm0, -0x40(%r8) > + vmovups %ymm0, -0x20(%r8) > + vzeroupper > + sfence > + ret That loop does seem to help on haswell at all, It is indistingushible from rep stosb loop above. I used following benchmark to check that with different sizes but performance stayed same. #include <stdlib.h> #include <string.h> int main(){ int i; char *x=malloc(100000000); for (i=0;i<100;i++) MEMSET(x,0,100000000); } for I in `seq 1 10`; do echo avx gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c time LD_LIBRARY_PATH=. ./a.out echo rep gcc -L. -DMEMSET=__memset_rep -lc_profile big.c time LD_LIBRARY_PATH=. ./a.out done
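The commands above assume the profiling shared objects from the linked tarball (-lc_profile, LD_LIBRARY_PATH). A rough standalone variant that only times the system memset, with the buffer size and iteration count copied from the snippet above and otherwise arbitrary, could look like the following; build with something like gcc -O2 bench.c (add -lrt on older glibc for clock_gettime).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int
main (void)
{
  size_t size = 100000000;            /* same 100 MB buffer as above */
  int iters = 100;
  int i;
  char *x = malloc (size);
  if (x == NULL)
    return 1;

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (i = 0; i < iters; i++)
    memset (x, 0, size);
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double s = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
  printf ("%.3f s total, %.2f GB/s\n", s, (double) iters * size / s / 1e9);
  free (x);
  return 0;
}

Whether LD_PRELOAD is used to swap memset implementations, as in the original commands, is up to the reader.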
2014-05-14 1:36 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>: > > > On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote: >> From: Ling Ma <ling.ml@alibaba-inc.com> >> >> In this patch we take advantage of HSW memory bandwidth, manage to >> reduce miss branch prediction by avoid using branch instructions and >> force destination to be aligned with avx instruction. >> > Now when we have a haswell machine on our department I tested this > implementation. Benchmark used and results are here. > > http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx130514.tar.bz2 > http://kam.mff.cuni.cz/~ondra/benchmark_string/memset_profile_avx.html > > This patch improves large inputs and does not regress > small inputs much which gives a total 10% improvement on gcc test, it > could be improved but it now looks good enough. Ling: Thanks Ondra! you give us many good suggestions and encouragement > I tried two alternatives. First is using avx2 in header(memset_fuse). > It look it helps, it adds additional 0.5% of performance. However I tried > to > crosscheck this with bash shell where comparison is in opposite > direction so I not entirely sure yet, see > > http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/memset_profile_avx/results_bash/result.html > Ling: Yes, we did the experiment on our 403.gcc(I list the download address below), it slows performance between 0 and 256 bytes although another benchmark gave us good result, so I said it hurt performance in my last email. > > Second is checking if rep treshold is best one, > this depends on application cache layout I do not have definite answer > yet (memset_rep and memset_avx_v2 variants), when data is in L2 cache we > could lower treshold to 1024 bytes but it slows real inputs for some > reason. > Ling: Yes, because of this reason, I once tried to use prefetch instruction in the code, and it will also hurt performance when the data is in L1. > >> The CPU2006 403.gcc benchmark also indicate this patch improves >> performance >> from 22.9% to 59% compared with original memset implemented by sse2. >> > I inspected that benchmark with my profiler is not that good as its only > simple > part of gcc and two third of total time is spend on 240 long inputs. Ling: please download www.yunos.org/tmp/test.memcpy.memset.zip, which includes our whole benchmark ,readme.txt and result.xls. we can run & check it. > > A large part of speedup could be explained that avx2 implementation has > a special case branch for 128-256 byte range but current one uses loop. > These distributions are different from other program and running gcc > itself as short inputs are more common there. 
> > >> + ALIGN(4) >> +L(gobble_data): >> +#ifdef SHARED_CACHE_SIZE_HALF >> + mov $SHARED_CACHE_SIZE_HALF, %r9 >> +#else >> + mov __x86_shared_cache_size_half(%rip), %r9 >> +#endif >> + shl $4, %r9 >> + cmp %r9, %rdx >> + ja L(gobble_big_data) >> + mov %rax, %r9 >> + mov %esi, %eax >> + mov %rdx, %rcx >> + rep stosb >> + mov %r9, %rax >> + vzeroupper >> + ret >> + >> + ALIGN(4) >> +L(gobble_big_data): >> + sub $0x80, %rdx >> +L(gobble_big_data_loop): >> + vmovntdq %ymm0, (%rdi) >> + vmovntdq %ymm0, 0x20(%rdi) >> + vmovntdq %ymm0, 0x40(%rdi) >> + vmovntdq %ymm0, 0x60(%rdi) >> + lea 0x80(%rdi), %rdi >> + sub $0x80, %rdx >> + jae L(gobble_big_data_loop) >> + vmovups %ymm0, -0x80(%r8) >> + vmovups %ymm0, -0x60(%r8) >> + vmovups %ymm0, -0x40(%r8) >> + vmovups %ymm0, -0x20(%r8) >> + vzeroupper >> + sfence >> + ret > > That loop does seem to help on haswell at all, It is indistingushible from > rep stosb loop above. I used following benchmark to check that with > different sizes but performance stayed same. > > #include <stdlib.h> > #include <string.h> > int main(){ > int i; > char *x=malloc(100000000); > for (i=0;i<100;i++) > MEMSET(x,0,100000000); > > } > > > for I in `seq 1 10`; do > echo avx > gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c > time LD_LIBRARY_PATH=. ./a.out > echo rep > gcc -L. -DMEMSET=__memset_rep -lc_profile big.c > time LD_LIBRARY_PATH=. ./a.out > done Ling: Ok, I will test it seriously, then send out new version. Thanks! Ling
Correction, in for following On Tue, May 13, 2014 at 07:36:16PM +0200, Ondřej Bílka wrote: > > + ALIGN(4) > > +L(gobble_data): > > +#ifdef SHARED_CACHE_SIZE_HALF > > + mov $SHARED_CACHE_SIZE_HALF, %r9 > > +#else > > + mov __x86_shared_cache_size_half(%rip), %r9 > > +#endif > > + shl $4, %r9 > > + cmp %r9, %rdx > > + ja L(gobble_big_data) > > + mov %rax, %r9 > > + mov %esi, %eax > > + mov %rdx, %rcx > > + rep stosb > > + mov %r9, %rax > > + vzeroupper > > + ret > > + > > + ALIGN(4) > > +L(gobble_big_data): > > + sub $0x80, %rdx > > +L(gobble_big_data_loop): > > + vmovntdq %ymm0, (%rdi) > > + vmovntdq %ymm0, 0x20(%rdi) > > + vmovntdq %ymm0, 0x40(%rdi) > > + vmovntdq %ymm0, 0x60(%rdi) > > + lea 0x80(%rdi), %rdi > > + sub $0x80, %rdx > > + jae L(gobble_big_data_loop) > > + vmovups %ymm0, -0x80(%r8) > > + vmovups %ymm0, -0x60(%r8) > > + vmovups %ymm0, -0x40(%r8) > > + vmovups %ymm0, -0x20(%r8) > > + vzeroupper > > + sfence > > + ret > > That loop does seem to help on haswell at all, It is indistingushible from > rep stosb loop above. I used following benchmark to check that with > different sizes but performance stayed same. > > #include <stdlib.h> > #include <string.h> > int main(){ > int i; > char *x=malloc(100000000); > for (i=0;i<100;i++) > MEMSET(x,0,100000000); > > } > > > for I in `seq 1 10`; do > echo avx > gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c > time LD_LIBRARY_PATH=. ./a.out > echo rep > gcc -L. -DMEMSET=__memset_rep -lc_profile big.c > time LD_LIBRARY_PATH=. ./a.out > done Sorry I forgotten that __memset_rep also has branch for large inputs so what I wrote was wrong. I retested it with fixed rep stosq and your loop is around 20% slower on similar test so its better to remove that loop. $ gcc big.c -o big $ time LD_PRELOAD=./memset-avx2.so ./big real 0m0.076s user 0m0.066s sys 0m0.010s $ time LD_PRELOAD=./memset_rep.so ./big real 0m0.063s user 0m0.042s sys 0m0.021s I use a different benchmark to be sure, it could be download here and run it commands above in that directory. http://kam.mff.cuni.cz/~ondra/memset_consistency_benchmark.tar.bz2 For different implementation you need to create .so with function memset, there is script compile that compiles all .s files provided that first line is of shape # arch_requirement function_name color
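To make the two large-buffer strategies being timed here concrete, here is a C sketch of both paths: rep stosb via GCC inline assembly, and a streaming-store loop like L(gobble_big_data_loop) using AVX intrinsics. The function names are made up for illustration, the streaming path assumes a 32-byte-aligned destination and a length that is a multiple of 128, and this is a model of what is being compared rather than the patch code. Build with roughly gcc -O2 -mavx.

#include <immintrin.h>
#include <stdlib.h>
#include <stddef.h>

/* Path timed as "rep": let the hardware fast-string logic fill the buffer. */
static void
fill_rep_stosb (void *dst, int c, size_t n)
{
  void *d = dst;
  __asm__ volatile ("rep stosb"
                    : "+D" (d), "+c" (n)
                    : "a" (c)
                    : "memory");
}

/* Path timed as "avx": non-temporal stores as in L(gobble_big_data_loop).
   Assumes dst is 32-byte aligned and n is a multiple of 128.  */
static void
fill_nt (void *dst, int c, size_t n)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  char *d = dst;
  size_t i;

  for (i = 0; i < n; i += 128)
    {
      _mm256_stream_si256 ((__m256i *) (d + i), v);
      _mm256_stream_si256 ((__m256i *) (d + i + 32), v);
      _mm256_stream_si256 ((__m256i *) (d + i + 64), v);
      _mm256_stream_si256 ((__m256i *) (d + i + 96), v);
    }
  _mm_sfence ();   /* order the streaming stores, like the sfence in the asm */
}

int
main (void)
{
  size_t n = 1 << 26;                 /* 64 MB, multiple of 128 */
  char *p = aligned_alloc (32, n);    /* C11; needs glibc 2.16 or later */
  if (p == NULL)
    return 1;
  fill_rep_stosb (p, 1, n);
  fill_nt (p, 2, n);
  return (unsigned char) p[n - 1];    /* expect 2 */
}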
Hi all, Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch When I send patch by git-send-email, libc-alpha@sourceware.org refuse to show it, Sorry for Inconvenience to you Thanks Ling 2014-05-16 4:14 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>: > Correction, in for following > > On Tue, May 13, 2014 at 07:36:16PM +0200, Ondřej Bílka wrote: >> > + ALIGN(4) >> > +L(gobble_data): >> > +#ifdef SHARED_CACHE_SIZE_HALF >> > + mov $SHARED_CACHE_SIZE_HALF, %r9 >> > +#else >> > + mov __x86_shared_cache_size_half(%rip), %r9 >> > +#endif >> > + shl $4, %r9 >> > + cmp %r9, %rdx >> > + ja L(gobble_big_data) >> > + mov %rax, %r9 >> > + mov %esi, %eax >> > + mov %rdx, %rcx >> > + rep stosb >> > + mov %r9, %rax >> > + vzeroupper >> > + ret >> > + >> > + ALIGN(4) >> > +L(gobble_big_data): >> > + sub $0x80, %rdx >> > +L(gobble_big_data_loop): >> > + vmovntdq %ymm0, (%rdi) >> > + vmovntdq %ymm0, 0x20(%rdi) >> > + vmovntdq %ymm0, 0x40(%rdi) >> > + vmovntdq %ymm0, 0x60(%rdi) >> > + lea 0x80(%rdi), %rdi >> > + sub $0x80, %rdx >> > + jae L(gobble_big_data_loop) >> > + vmovups %ymm0, -0x80(%r8) >> > + vmovups %ymm0, -0x60(%r8) >> > + vmovups %ymm0, -0x40(%r8) >> > + vmovups %ymm0, -0x20(%r8) >> > + vzeroupper >> > + sfence >> > + ret >> >> That loop does seem to help on haswell at all, It is indistingushible >> from >> rep stosb loop above. I used following benchmark to check that with >> different sizes but performance stayed same. >> >> #include <stdlib.h> >> #include <string.h> >> int main(){ >> int i; >> char *x=malloc(100000000); >> for (i=0;i<100;i++) >> MEMSET(x,0,100000000); >> >> } >> >> >> for I in `seq 1 10`; do >> echo avx >> gcc -L. -DMEMSET=__memset_avx2 -lc_profile big.c >> time LD_LIBRARY_PATH=. ./a.out >> echo rep >> gcc -L. -DMEMSET=__memset_rep -lc_profile big.c >> time LD_LIBRARY_PATH=. ./a.out >> done > > Sorry I forgotten that __memset_rep also has branch for large inputs so > what I wrote was wrong. > > I retested it with fixed rep stosq and your loop is around 20% slower on > similar test so its better to remove that loop. > > $ gcc big.c -o big > $ time LD_PRELOAD=./memset-avx2.so ./big > > real 0m0.076s > user 0m0.066s > sys 0m0.010s > > $ time LD_PRELOAD=./memset_rep.so ./big > > real 0m0.063s > user 0m0.042s > sys 0m0.021s > > I use a different benchmark to be sure, it could be download here and > run it commands above in that directory. > > http://kam.mff.cuni.cz/~ondra/memset_consistency_benchmark.tar.bz2 > > For different implementation you need to create .so with function > memset, there is script compile that compiles all .s files provided that > first line is of shape > > # arch_requirement function_name color > >
On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
> Hi all,
>
> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
>
> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
> to show it,
> Sorry for Inconvenience to you
>
It looks like a typo in the address; yonos.org does not exist.
Ondra,

I retried and got the patch from http://www.yunos.org/tmp/memset-avx2.patch;
the address is yunos.org.

Thanks
Ling

2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote:
>> Hi all,
>>
>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch
>>
>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse
>> to show it,
>> Sorry for Inconvenience to you
>>
> It looks its typo in adress, yonos.org does not exist.
On Fri, May 30, 2014 at 7:10 AM, Ling Ma <ling.ma.program@gmail.com> wrote: > Ondra, > > I retried and get the patch from http://www.yunos.org/tmp/memset-avx2.patch > it is yunos.org . http://www.yunos.org/tmp/memset-avx2.patch times out for me. Can you gzip the patch and send it as an attachment? Thanks. > Thanks > Ling > > > 2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>: >> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote: >>> Hi all, >>> >>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch >>> >>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse >>> to show it, >>> Sorry for Inconvenience to you >>> >> It looks its typo in adress, yonos.org does not exist. >> >> >>
H.J The website changed IP, now the code is available again: http://www.yunos.org/tmp/memset-avx2.patch , and also gziped as attachment in this mail. Thanks Ling 2014-06-04 7:56 GMT+08:00, H.J. Lu <hjl.tools@gmail.com>: > On Fri, May 30, 2014 at 7:10 AM, Ling Ma <ling.ma.program@gmail.com> wrote: >> Ondra, >> >> I retried and get the patch from >> http://www.yunos.org/tmp/memset-avx2.patch >> it is yunos.org . > > http://www.yunos.org/tmp/memset-avx2.patch > > times out for me. Can you gzip the patch and send it as > an attachment? > > Thanks. > >> Thanks >> Ling >> >> >> 2014-05-30 19:30 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>: >>> On Fri, May 30, 2014 at 05:02:29PM +0800, Ling Ma wrote: >>>> Hi all, >>>> >>>> Here is latest memset pach: http://www.yunos.org/tmp/memset-avx2.patch >>>> >>>> When I send patch by git-send-email, libc-alpha@sourceware.org refuse >>>> to show it, >>>> Sorry for Inconvenience to you >>>> >>> It looks its typo in adress, yonos.org does not exist. >>> >>> >>> > > > > -- > H.J. >
On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote: > H.J > > The website changed IP, now the code is available again: > http://www.yunos.org/tmp/memset-avx2.patch , > and also gziped as attachment in this mail. > > Thanks > Ling > Now performance looks ok for me, but few formating problems. With these fixed I would be satisfied H.J do you have comments? There is possible followup to also optimize __bzero like we do in general case. Then second followup would be decrease function size by reshuffling blocks, on several places there are 15/16 free bytes due alignment. Formatting problems are here: + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + mov %rdi, %rsi + mov %rdi, %rax here +L(less_16bytes): + vmovd %xmm0, %rcx + cmp $8, %dl + jb L(less_8bytes) + mov %rcx, (%rdi) + mov %rcx, -0x08(%rsi) + ret + + .p2align 4 +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret and here + mov %rax, %rsi + vmovd %xmm0, %eax + mov %rdx, %rcx As I mentioned code size one trick is that instructions with -128 argument are shorter than with 128. You could save 16 bytes with following modification, however it must be tested if it improves performance. --- x 2014-06-05 18:20:35.313645591 +0200 +++ sysdeps/x86_64/multiarch/memset-avx2.S 2014-06-05 18:22:25.068642767 +0200 @@ -95,7 +95,6 @@ .p2align 4 L(256bytesormore): vinserti128 $1, %xmm0, %ymm0, %ymm0 - mov $0x80, %rcx add %rdx, %rsi mov %rdi, %r9 vmovdqu %ymm0, (%rdi) @@ -105,15 +104,15 @@ add %r9, %rdx cmp $4096, %rdx ja L(gobble_data) - sub %ecx, %edx + add $-128, %edx L(gobble_128_loop): vmovdqa %ymm0, (%rdi) vmovdqa %ymm0, 0x20(%rdi) vmovdqa %ymm0, 0x40(%rdi) vmovdqa %ymm0, 0x60(%rdi) - add %rcx, %rdi - sub %ecx, %edx - jae L(gobble_128_loop) + sub $-128, %rdi + add $-128, %edx + jb L(gobble_128_loop) vmovdqu %ymm0, -0x80(%rsi) vmovdqu %ymm0, -0x60(%rsi) vmovdqu %ymm0, -0x40(%rsi)
On Thu, Jun 5, 2014 at 9:32 AM, Ondřej Bílka <neleai@seznam.cz> wrote: > On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote: >> H.J >> >> The website changed IP, now the code is available again: >> http://www.yunos.org/tmp/memset-avx2.patch , >> and also gziped as attachment in this mail. >> >> Thanks >> Ling >> > > Now performance looks ok for me, but few formating problems. > With these fixed I would be satisfied H.J do you have comments? I don't have any additional comments. Thanks. > There is possible followup to also optimize __bzero like we do in > general case. > > Then second followup would be decrease function size by reshuffling > blocks, on several places there are 15/16 free bytes due alignment. > > Formatting problems are here: > > + vpxor %xmm0, %xmm0, %xmm0 > + vmovd %esi, %xmm1 > + mov %rdi, %rsi > + mov %rdi, %rax > > here > > +L(less_16bytes): > + vmovd %xmm0, %rcx > + cmp $8, %dl > + jb L(less_8bytes) > + mov %rcx, (%rdi) > + mov %rcx, -0x08(%rsi) > + ret > + > + .p2align 4 > +L(less_8bytes): > + cmp $4, %dl > + jb L(less_4bytes) > + mov %ecx, (%rdi) > + mov %ecx, -0x04(%rsi) > + ret > > and here > > + mov %rax, %rsi > + vmovd %xmm0, %eax > + mov %rdx, %rcx > > As I mentioned code size one trick is that instructions > with -128 argument are shorter than with 128. You could save 16 > bytes with following modification, however it must be tested if > it improves performance. > > > --- x 2014-06-05 18:20:35.313645591 +0200 > +++ sysdeps/x86_64/multiarch/memset-avx2.S 2014-06-05 > 18:22:25.068642767 +0200 > @@ -95,7 +95,6 @@ > .p2align 4 > L(256bytesormore): > vinserti128 $1, %xmm0, %ymm0, %ymm0 > - mov $0x80, %rcx > add %rdx, %rsi > mov %rdi, %r9 > vmovdqu %ymm0, (%rdi) > @@ -105,15 +104,15 @@ > add %r9, %rdx > cmp $4096, %rdx > ja L(gobble_data) > - sub %ecx, %edx > + add $-128, %edx > L(gobble_128_loop): > vmovdqa %ymm0, (%rdi) > vmovdqa %ymm0, 0x20(%rdi) > vmovdqa %ymm0, 0x40(%rdi) > vmovdqa %ymm0, 0x60(%rdi) > - add %rcx, %rdi > - sub %ecx, %edx > - jae L(gobble_128_loop) > + sub $-128, %rdi > + add $-128, %edx > + jb L(gobble_128_loop) > vmovdqu %ymm0, -0x80(%rsi) > vmovdqu %ymm0, -0x60(%rsi) > vmovdqu %ymm0, -0x40(%rsi) >
I will send the next version after performance test according to your comments. Thanks Ling 2014-06-06 0:32 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>: > On Wed, Jun 04, 2014 at 03:00:05PM +0800, Ling Ma wrote: >> H.J >> >> The website changed IP, now the code is available again: >> http://www.yunos.org/tmp/memset-avx2.patch , >> and also gziped as attachment in this mail. >> >> Thanks >> Ling >> > > Now performance looks ok for me, but few formating problems. > With these fixed I would be satisfied H.J do you have comments? > > There is possible followup to also optimize __bzero like we do in > general case. > > Then second followup would be decrease function size by reshuffling > blocks, on several places there are 15/16 free bytes due alignment. > > Formatting problems are here: > > + vpxor %xmm0, %xmm0, %xmm0 > + vmovd %esi, %xmm1 > + mov %rdi, %rsi > + mov %rdi, %rax > > here > > +L(less_16bytes): > + vmovd %xmm0, %rcx > + cmp $8, %dl > + jb L(less_8bytes) > + mov %rcx, (%rdi) > + mov %rcx, -0x08(%rsi) > + ret > + > + .p2align 4 > +L(less_8bytes): > + cmp $4, %dl > + jb L(less_4bytes) > + mov %ecx, (%rdi) > + mov %ecx, -0x04(%rsi) > + ret > > and here > > + mov %rax, %rsi > + vmovd %xmm0, %eax > + mov %rdx, %rcx > > As I mentioned code size one trick is that instructions > with -128 argument are shorter than with 128. You could save 16 > bytes with following modification, however it must be tested if > it improves performance. > > > --- x 2014-06-05 18:20:35.313645591 +0200 > +++ sysdeps/x86_64/multiarch/memset-avx2.S 2014-06-05 > 18:22:25.068642767 +0200 > @@ -95,7 +95,6 @@ > .p2align 4 > L(256bytesormore): > vinserti128 $1, %xmm0, %ymm0, %ymm0 > - mov $0x80, %rcx > add %rdx, %rsi > mov %rdi, %r9 > vmovdqu %ymm0, (%rdi) > @@ -105,15 +104,15 @@ > add %r9, %rdx > cmp $4096, %rdx > ja L(gobble_data) > - sub %ecx, %edx > + add $-128, %edx > L(gobble_128_loop): > vmovdqa %ymm0, (%rdi) > vmovdqa %ymm0, 0x20(%rdi) > vmovdqa %ymm0, 0x40(%rdi) > vmovdqa %ymm0, 0x60(%rdi) > - add %rcx, %rdi > - sub %ecx, %edx > - jae L(gobble_128_loop) > + sub $-128, %rdi > + add $-128, %edx > + jb L(gobble_128_loop) > vmovdqu %ymm0, -0x80(%rsi) > vmovdqu %ymm0, -0x60(%rsi) > vmovdqu %ymm0, -0x40(%rsi) > >
In this patch (attached as a gzipped file) we take advantage of HSW memory
bandwidth and reduce branch mispredictions by avoiding branch instructions
and forcing the destination to be aligned with AVX and AVX2 instructions.

The CPU2006 403.gcc benchmark indicates this patch improves performance
by 26% to 59%.

This version incorporates Ondra's comments and keeps branch instructions
from crossing 16-byte-aligned code boundaries.

Thanks
Ling
On Tue, Jun 10, 2014 at 6:52 AM, Ling Ma <ling.ma.program@gmail.com> wrote: > In this patch as gziped attachment, we take advantage of HSW memory > bandwidth, manage to reduce miss branch prediction by avoiding using > branch instructions and > force destination to be aligned with avx & avx2 instruction. > > The CPU2006 403.gcc benchmark indicates this patch improves performance > from 26% to 59%. > > This version accept Ondra's comments and avoid branch instruction to > cross 16byte-aligned code. Any feedback? I'd like to check it in before 2.20 code freeze. Thanks.
On Wed, Jun 18, 2014 at 09:47:11AM -0700, H.J. Lu wrote:
> On Tue, Jun 10, 2014 at 6:52 AM, Ling Ma <ling.ma.program@gmail.com> wrote:
> > In this patch as gziped attachment, we take advantage of HSW memory
> > bandwidth, manage to reduce miss branch prediction by avoiding using
> > branch instructions and
> > force destination to be aligned with avx & avx2 instruction.
> >
> > The CPU2006 403.gcc benchmark indicates this patch improves performance
> > from 26% to 59%.
> >
> > This version accept Ondra's comments and avoid branch instruction to
> > cross 16byte-aligned code.
>
> Any feedback? I'd like to check it in before 2.20 code freeze.
>
As I said before, it's OK with the formatting fixed; you could commit it if you wish.
diff --git a/ChangeLog b/ChangeLog index ab23a3a..851fe9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +2014-04-04 Ling Ma <ling.ml@alibaba-inc.com> + + * sysdeps/x86_64/multiarch/Makefile: Add memset-avx2 + * sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset + * sysdeps/x86_64/multiarch/memset.S: New file for multiple memset + versions + * sysdeps/x86_64/multiarch/memset_chk.S: New file for multiple memset_chk + versions + 2014-04-04 Sihai Yao <sihai.ysh@alibaba-inc.com> * sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPU_INDEX_7 and diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 57a3c13..42df96f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + memset-avx2 + ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S new file mode 100644 index 0000000..5d4a487 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2.S @@ -0,0 +1,192 @@ +/* memset with AVX2 + Copyright (C) 2014 Free Software Foundation, Inc. + Contributed by Alibaba Group. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if !defined NOT_IN_libc + +#include "asm-syntax.h" +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif +#ifndef MEMSET +# define MEMSET __memset_avx2 +# define MEMSET_CHK __memset_chk_avx2 +#endif + + .section .text.avx2,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %r8 + vpshufb %xmm0, %xmm1, %xmm0 + mov %rdi, %rax + cmp $256, %rdx + jae L(256bytesormore) + vmovd %xmm0, %rcx + cmp $128, %rdx + jb L(less_128bytes) + vmovups %xmm0, (%rdi) + vmovups %xmm0, 0x10(%rdi) + vmovups %xmm0, 0x20(%rdi) + vmovups %xmm0, 0x30(%rdi) + vmovups %xmm0, 0x40(%rdi) + vmovups %xmm0, 0x50(%rdi) + vmovups %xmm0, 0x60(%rdi) + vmovups %xmm0, 0x70(%rdi) + vmovups %xmm0, -0x80(%r8) + vmovups %xmm0, -0x70(%r8) + vmovups %xmm0, -0x60(%r8) + vmovups %xmm0, -0x50(%r8) + vmovups %xmm0, -0x40(%r8) + vmovups %xmm0, -0x30(%r8) + vmovups %xmm0, -0x20(%r8) + vmovups %xmm0, -0x10(%r8) + ret + ALIGN(4) +L(less_128bytes): + cmp $64, %edx + jb L(less_64bytes) + vmovups %xmm0, (%rdi) + vmovups %xmm0, 0x10(%rdi) + vmovups %xmm0, 0x20(%rdi) + vmovups %xmm0, 0x30(%rdi) + vmovups %xmm0, -0x40(%r8) + vmovups %xmm0, -0x30(%r8) + vmovups %xmm0, -0x20(%r8) + vmovups %xmm0, -0x10(%r8) + ret + ALIGN(4) +L(less_64bytes): + cmp $32, %edx + jb L(less_32bytes) + vmovups %xmm0, (%rdi) + vmovups %xmm0, 0x10(%rdi) + vmovups %xmm0, -0x20(%r8) + vmovups %xmm0, -0x10(%r8) + ret + ALIGN(4) +L(less_32bytes): + cmp $16, %edx + jb L(less_16bytes) + vmovups %xmm0, (%rdi) + vmovups %xmm0, -0x10(%r8) + ret + ALIGN(4) +L(less_16bytes): + cmp $8, %edx + jb L(less_8bytes) + mov %rcx, (%rdi) + mov %rcx, -0x08(%r8) + ret + ALIGN(4) +L(less_8bytes): + cmp $4, %edx + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%r8) + ALIGN(4) +L(less_4bytes): + cmp $2, %edx + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%r8) + ret + ALIGN(4) +L(less_2bytes): + cmp $1, %edx + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + + ALIGN(4) +L(256bytesormore): + vinserti128 $1, %xmm0, %ymm0, %ymm0 + vmovups %ymm0, (%rdi) + mov %rdi, %r9 + and $-0x20, %rdi + add $32, %rdi + sub %rdi, %r9 + add %r9, %rdx + cmp $4096, %rdx + ja L(gobble_data) + + sub $0x80, %rdx +L(gobble_128_loop): + vmovaps %ymm0, (%rdi) + vmovaps %ymm0, 0x20(%rdi) + vmovaps %ymm0, 0x40(%rdi) + vmovaps %ymm0, 0x60(%rdi) + lea 0x80(%rdi), %rdi + sub $0x80, %rdx + jae L(gobble_128_loop) + vmovups %ymm0, -0x80(%r8) + vmovups %ymm0, -0x60(%r8) + vmovups %ymm0, -0x40(%r8) + vmovups %ymm0, -0x20(%r8) + vzeroupper + ret + + ALIGN(4) +L(gobble_data): +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r9 +#else + mov __x86_shared_cache_size_half(%rip), %r9 +#endif + shl $4, %r9 + cmp %r9, %rdx + ja L(gobble_big_data) + mov %rax, %r9 + mov %esi, %eax + mov %rdx, %rcx + rep stosb + mov %r9, %rax + vzeroupper + ret + + ALIGN(4) +L(gobble_big_data): + sub $0x80, %rdx +L(gobble_big_data_loop): + vmovntdq %ymm0, (%rdi) + vmovntdq %ymm0, 0x20(%rdi) + vmovntdq %ymm0, 0x40(%rdi) + vmovntdq %ymm0, 0x60(%rdi) + lea 0x80(%rdi), %rdi + sub $0x80, %rdx + jae L(gobble_big_data_loop) + vmovups %ymm0, -0x80(%r8) + vmovups %ymm0, -0x60(%r8) + vmovups %ymm0, -0x40(%r8) + vmovups %ymm0, -0x20(%r8) + vzeroupper + sfence + ret + +END (MEMSET) +#endif diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S new file mode 100644 index 0000000..df903af --- /dev/null +++ 
b/sysdeps/x86_64/multiarch/memset.S @@ -0,0 +1,59 @@ +/* Multiple versions of memset + Copyright (C) 2014 Free Software Foundation, Inc. + Contributed by Alibaba Group. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <shlib-compat.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#ifndef NOT_IN_libc +ENTRY(memset) + .type memset, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __memset_sse2(%rip), %rax + testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + jz 2f + leaq __memset_avx2(%rip), %rax +2: ret +END(memset) +#endif + +#if !defined NOT_IN_libc +# undef memset +# define memset __memset_sse2 + +# undef __memset_chk +# define __memset_chk __memset_chk_sse2 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memset calls through a PLT. + The speedup we get from using GPR instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2 +# endif + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S new file mode 100644 index 0000000..f048dac --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset_chk.S @@ -0,0 +1,44 @@ +/* Multiple versions of memset_chk + Copyright (C) 2014 Free Software Foundation, Inc. + Contributed by Alibaba Group. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#ifndef NOT_IN_libc +# ifdef SHARED +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq __memset_chk_sse2(%rip), %rax + testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip) + jz 2f + leaq __memset_chk_avx2(%rip), %rax +2: ret +END(__memset_chk) + +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else +# include "../memset_chk.S" +# endif +#endif
From: Ling Ma <ling.ml@alibaba-inc.com>

In this patch we take advantage of HSW memory bandwidth and reduce branch
mispredictions by avoiding branch instructions and forcing the destination
to be aligned with AVX instructions.

The CPU2006 403.gcc benchmark also indicates this patch improves performance
by 22.9% to 59% compared with the original memset implemented with SSE2.

                 memset-AVX    memset-SSE2   AVX vs SSE2
 gcc.166.i       1877958334    2495113045    1.328630673
 gcc.200.i       3507448572    4869401205    1.388302952
 gcc.cp-decl.i   1742510758    2282801367    1.310064432
 gcc.c-typeck.i  9546331594   12158804366    1.273662479
 gcc.expr2.i     5067111165    6470777800    1.277015165
 gcc.expr.i      3434703577    4420252661    1.286938614
 gcc.g23.i       5141096267    6318410858    1.22900069
 gcc.s04.i       8652255048   10923077090    1.262454358
 gcc.scilab.i    1209694573    1925173588    1.591454265

---
We fixed the code and re-tested all cases, including SSE2 and AVX2.

 ChangeLog                              |   9 ++
 sysdeps/x86_64/multiarch/Makefile      |   4 +-
 sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
 sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
 5 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/memset.S
 create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
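As a side note on the dispatch mechanism in memset.S and memset_chk.S above: the hand-written gnu_indirect_function resolver checks __cpu_features and returns either __memset_sse2 or __memset_avx2. The same pattern can be sketched in C with GCC's ifunc attribute and __builtin_cpu_supports. This is purely an illustration of the mechanism with hypothetical names; glibc itself uses the init-arch assembly shown in the diff, not this, and the sketch needs a GNU toolchain with ifunc support on an ELF target.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

typedef void *(*memset_fn) (void *, int, size_t);

/* Hypothetical stand-ins for __memset_sse2 / __memset_avx2.  */
static void *memset_sse2_impl (void *d, int c, size_t n) { return memset (d, c, n); }
static void *memset_avx2_impl (void *d, int c, size_t n) { return memset (d, c, n); }

/* Resolver: runs once at relocation time and picks an implementation,
   much as the assembly stub checks bit_AVX2_Usable in __cpu_features.  */
static memset_fn
resolve_memset (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return memset_avx2_impl;
  return memset_sse2_impl;
}

void *my_memset (void *dst, int c, size_t n)
     __attribute__ ((ifunc ("resolve_memset")));

int
main (void)
{
  char buf[64];
  my_memset (buf, 'x', sizeof buf);
  printf ("%c\n", buf[63]);           /* x */
  return 0;
}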