@@ -1,5 +1,21 @@
2014-04-04 Ling Ma <ling.ml@alibaba-inc.com>
+ * sysdeps/x86_64/multiarch/Makefile: Add avx memcpy/mempcpy/memmove
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add support for related
+	files with avx memcpy
+ * sysdeps/x86_64/multiarch/memcpy.S: Add support for avx memcpy
+ * sysdeps/x86_64/multiarch/memcpy_chk.S: Add support for avx memcpy_chk
+ * sysdeps/x86_64/multiarch/memmove.c: Add support for avx memmove
+ * sysdeps/x86_64/multiarch/memmove_chk.c: Add support for avx memmove_chk
+ * sysdeps/x86_64/multiarch/mempcpy.S: Add support for avx mempcpy
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S: Add support for avx mempcpy_chk
+ * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: New file for avx memcpy
+ * sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: New file for avx mempcpy
+ * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: New file for avx
+ memmove
+
+2014-04-04 Ling Ma <ling.ml@alibaba-inc.com>
+
* sysdeps/x86_64/multiarch/Makefile: Add memset-avx2
* sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset
* sysdeps/x86_64/multiarch/memset.S: New file for multiple memset
@@ -11,6 +11,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcmp-sse4 memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -46,6 +46,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX,
+ __memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -55,6 +57,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memmove.S. */
IFUNC_IMPL (i, name, memmove,
+ IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX,
+ __memmove_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
@@ -201,6 +205,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX,
+ __memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -210,6 +216,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
IFUNC_IMPL (i, name, memcpy,
+ IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX,
+ __memcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
@@ -218,6 +226,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
IFUNC_IMPL (i, name, __mempcpy_chk,
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX,
+ __mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
@@ -227,6 +237,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
IFUNC_IMPL (i, name, mempcpy,
+ IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX,
+ __mempcpy_avx_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
new file mode 100644
@@ -0,0 +1,389 @@
+/* memcpy with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by Alibaba Group.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+#ifndef MEMCPY
+# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+#endif
+
+ .section .text.avx,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+ lea (%rsi, %rdx), %r8
+ lea (%rdi, %rdx), %r9
+ cmp $256, %rdx
+ ja L(256bytesormore)
+ cmp $128, %edx
+ jb L(less_128bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups 0x40(%rsi), %xmm4
+ vmovups 0x50(%rsi), %xmm5
+ vmovups 0x60(%rsi), %xmm6
+ vmovups 0x70(%rsi), %xmm7
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, 0x40(%rdi)
+ vmovups %xmm5, 0x50(%rdi)
+ vmovups %xmm6, 0x60(%rdi)
+ vmovups %xmm7, 0x70(%rdi)
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_128bytes):
+ cmp $64, %edx
+ jb L(less_64bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups 0x20(%rsi), %xmm2
+ vmovups 0x30(%rsi), %xmm3
+ vmovups -0x40(%r8), %xmm4
+ vmovups -0x30(%r8), %xmm5
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm2, 0x20(%rdi)
+ vmovups %xmm3, 0x30(%rdi)
+ vmovups %xmm4, -0x40(%r9)
+ vmovups %xmm5, -0x30(%r9)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_64bytes):
+ cmp $32, %edx
+ jb L(less_32bytes)
+ vmovups (%rsi), %xmm0
+ vmovups 0x10(%rsi), %xmm1
+ vmovups -0x20(%r8), %xmm6
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm1, 0x10(%rdi)
+ vmovups %xmm6, -0x20(%r9)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_32bytes):
+ cmp $16, %edx
+ jb L(less_16bytes)
+ vmovups (%rsi), %xmm0
+ vmovups -0x10(%r8), %xmm7
+ vmovups %xmm0, (%rdi)
+ vmovups %xmm7, -0x10(%r9)
+ ret
+ ALIGN(4)
+L(less_16bytes):
+ cmp $8, %edx
+ jb L(less_8bytes)
+ movq (%rsi), %rcx
+ movq -0x08(%r8), %r10
+ movq %rcx, (%rdi)
+ movq %r10, -0x08(%r9)
+ ret
+ ALIGN(4)
+L(less_8bytes):
+ cmp $4, %edx
+ jb L(less_4bytes)
+ mov (%rsi), %ecx
+ mov -0x04(%r8), %edx
+ mov %ecx, (%rdi)
+ mov %edx, -0x04(%r9)
+ ret
+ ALIGN(4)
+L(less_4bytes):
+ cmp $2, %edx
+ jb L(less_2bytes)
+ mov (%rsi), %cx
+ mov -0x02(%r8), %dx
+ mov %cx, (%rdi)
+ mov %dx, -0x02(%r9)
+ ret
+ ALIGN(4)
+L(less_2bytes):
+ cmp $1, %rdx
+ jb L(less_0bytes)
+ mov (%rsi), %cl
+ mov %cl, (%rdi)
+L(less_0bytes):
+ ret
+
+ ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jae L(copy_backward)
+#endif
+ mov %rdi, %r10
+ cmp $2048, %rdx
+ jae L(gobble_data_movsb)
+ vmovups -0x80(%r8), %xmm8
+ vmovups -0x70(%r8), %xmm9
+ and $-32, %rdi
+ add $32, %rdi
+ vmovups -0x60(%r8), %xmm10
+ vmovups -0x50(%r8), %xmm11
+ mov %rdi, %r11
+ sub %r10, %r11
+ vmovups -0x40(%r8), %xmm12
+ vmovups -0x30(%r8), %xmm13
+ sub %r11, %rdx
+ vmovups -0x20(%r8), %xmm14
+ vmovups -0x10(%r8), %xmm15
+ vmovups (%rsi), %ymm4
+ add %r11, %rsi
+ sub $0x80, %rdx
+L(goble_128_loop):
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ lea 0x80(%rsi), %rsi
+ vmovaps %ymm0, (%rdi)
+ vmovaps %ymm1, 0x20(%rdi)
+ vmovaps %ymm2, 0x40(%rdi)
+ vmovaps %ymm3, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(goble_128_loop)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, -0x80(%r9)
+ vmovups %xmm9, -0x70(%r9)
+ vmovups %xmm10, -0x60(%r9)
+ vmovups %xmm11, -0x50(%r9)
+ vmovups %xmm12, -0x40(%r9)
+ vmovups %xmm13, -0x30(%r9)
+ vmovups %xmm14, -0x20(%r9)
+ vmovups %xmm15, -0x10(%r9)
+ ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+
+#ifdef USE_AS_MEMMOVE
+ mov %rsi, %r10
+ sub %rdi, %r10
+ cmp %rdx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ cmp %rcx, %r10
+ jae L(memmove_use_memcpy_fwd)
+ jmp L(gobble_mem_fwd_llc_start)
+L(memmove_use_memcpy_fwd):
+#endif
+
+ cmp %rcx, %rdx
+ jae L(gobble_big_data_fwd)
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+ mov %rdx, %rcx
+ rep movsb
+ ret
+
+L(gobble_big_data_fwd):
+ vmovups (%rsi), %ymm4
+ vmovups -0x80(%r8), %xmm5
+ vmovups -0x70(%r8), %xmm6
+ vmovups -0x60(%r8), %xmm7
+ vmovups -0x50(%r8), %xmm8
+ vmovups -0x40(%r8), %xmm9
+ vmovups -0x30(%r8), %xmm10
+ vmovups -0x20(%r8), %xmm11
+ vmovups -0x10(%r8), %xmm12
+ mov %rdi, %r8
+ and $-32, %rdi
+ add $32, %rdi
+ mov %rdi, %r10
+ sub %r8, %r10
+ sub %r10, %rdx
+ add %r10, %rsi
+ sub $0x80, %rdx
+L(gobble_mem_fwd_loop):
+ prefetchnta 0x1c0(%rsi)
+ prefetchnta 0x280(%rsi)
+ vmovups (%rsi), %ymm0
+ vmovups 0x20(%rsi), %ymm1
+ vmovups 0x40(%rsi), %ymm2
+ vmovups 0x60(%rsi), %ymm3
+ lea 0x80(%rsi), %rsi
+ vmovntdq %ymm0, (%rdi)
+ vmovntdq %ymm1, 0x20(%rdi)
+ vmovntdq %ymm2, 0x40(%rdi)
+ vmovntdq %ymm3, 0x60(%rdi)
+ lea 0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_fwd_loop)
+ sfence
+ vmovups %ymm4, (%r8)
+ vzeroupper
+ vmovups %xmm5, -0x80(%r9)
+ vmovups %xmm6, -0x70(%r9)
+ vmovups %xmm7, -0x60(%r9)
+ vmovups %xmm8, -0x50(%r9)
+ vmovups %xmm9, -0x40(%r9)
+ vmovups %xmm10, -0x30(%r9)
+ vmovups %xmm11, -0x20(%r9)
+ vmovups %xmm12, -0x10(%r9)
+ ret
+
+#ifdef USE_AS_MEMMOVE
+ ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_shared_cache_size_half(%rip), %rcx
+#endif
+ shl $3, %rcx
+ mov %rdi, %r9
+ vmovups (%rsi), %xmm8
+ vmovups 0x10(%rsi), %xmm9
+ add %rdx, %rdi
+ vmovups 0x20(%rsi), %xmm10
+ vmovups 0x30(%rsi), %xmm11
+ lea -0x20(%rdi), %r10
+ mov %rdi, %r11
+ vmovups 0x40(%rsi), %xmm12
+ vmovups 0x50(%rsi), %xmm13
+ and $0x1f, %r11
+ vmovups 0x60(%rsi), %xmm14
+ vmovups 0x70(%rsi), %xmm15
+ xor %r11, %rdi
+ add %rdx, %rsi
+ vmovups -0x20(%rsi), %ymm4
+ sub %r11, %rsi
+ sub %r11, %rdx
+ mov %rdi, %r11
+ sub %rsi, %r11
+ cmp %rdx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ cmp %rcx, %r11
+ jae L(memmove_use_memcpy_bwd)
+ jmp L(gobble_mem_bwd_llc_start)
+L(memmove_use_memcpy_bwd):
+ cmp %rcx, %rdx
+ ja L(gobble_big_data_bwd)
+L(gobble_mem_bwd_llc_start):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_llc):
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovaps %ymm0, -0x20(%rdi)
+ vmovaps %ymm1, -0x40(%rdi)
+ vmovaps %ymm2, -0x60(%rdi)
+ vmovaps %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_llc)
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+L(gobble_big_data_bwd):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_loop):
+ prefetchnta -0x1c0(%rsi)
+ prefetchnta -0x280(%rsi)
+ vmovups -0x20(%rsi), %ymm0
+ vmovups -0x40(%rsi), %ymm1
+ vmovups -0x60(%rsi), %ymm2
+ vmovups -0x80(%rsi), %ymm3
+ lea -0x80(%rsi), %rsi
+ vmovntdq %ymm0, -0x20(%rdi)
+ vmovntdq %ymm1, -0x40(%rdi)
+ vmovntdq %ymm2, -0x60(%rdi)
+ vmovntdq %ymm3, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+ sub $0x80, %rdx
+ jae L(gobble_mem_bwd_loop)
+ sfence
+ vmovups %ymm4, (%r10)
+ vzeroupper
+ vmovups %xmm8, (%r9)
+ vmovups %xmm9, 0x10(%r9)
+ vmovups %xmm10, 0x20(%r9)
+ vmovups %xmm11, 0x30(%r9)
+ vmovups %xmm12, 0x40(%r9)
+ vmovups %xmm13, 0x50(%r9)
+ vmovups %xmm14, 0x60(%r9)
+ vmovups %xmm15, 0x70(%r9)
+ ret
+#endif
+END (MEMCPY)
+#endif
@@ -32,6 +32,10 @@ ENTRY(__new_memcpy)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
+1: leaq __memcpy_avx_unaligned(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 1f
+ ret
1: leaq __memcpy_sse2(%rip), %rax
testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jnz 2f
@@ -39,6 +39,9 @@ ENTRY(__memcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __memcpy_chk_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __memcpy_chk_avx_unaligned(%rip), %rax
2: ret
END(__memcpy_chk)
# else
new file mode 100644
@@ -0,0 +1,23 @@
+/* memmove with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by Alibaba Group.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_avx_unaligned
+#define MEMCPY_CHK __memmove_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
@@ -35,6 +35,8 @@
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
+
#endif
#include "string/memmove.c"
@@ -47,10 +49,11 @@ extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
- HAS_SSSE3
+ HAS_AVX ? __memmove_avx_unaligned :
+ (HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)
+ : __memmove_sse2));
strong_alias (__libc_memmove, memmove)
@@ -25,11 +25,13 @@
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
#include "debug/memmove_chk.c"
libc_ifunc (__memmove_chk,
- HAS_SSSE3
+ HAS_AVX ? __memmove_chk_avx_unaligned :
+ (HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
- : __memmove_chk_sse2);
+ : __memmove_chk_sse2));
new file mode 100644
@@ -0,0 +1,23 @@
+/* mempcpy with AVX
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ Contributed by Alibaba Group.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_avx_unaligned
+#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
+#include "memcpy-avx-unaligned.S"
@@ -37,6 +37,9 @@ ENTRY(__mempcpy)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __mempcpy_avx_unaligned(%rip), %rax
2: ret
END(__mempcpy)
@@ -39,6 +39,9 @@ ENTRY(__mempcpy_chk)
testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
jz 2f
leaq __mempcpy_chk_ssse3_back(%rip), %rax
+ testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+ jz 2f
+ leaq __mempcpy_chk_avx_unaligned(%rip), %rax
2: ret
END(__mempcpy_chk)
# else
From: Ling Ma <ling.ml@alibaba-inc.com> In this patch we manage to reduce branch misprediction by avoiding branch instructions and forcing the destination to be aligned, using AVX instructions. The CPU2006 403.gcc benchmark also indicates this patch improves performance from 2% to 12% or 2% to 21% compared with the original memcpy implemented by sse2 and ssse3 respectively. memcpy-AVX memcpy-SSE2 memcpy-SSSE3 AVX vs SSE2 AVX vs SSSE3 gcc.166.i 302551459 332189574 345378682 1.097960575 1.141553517 gcc.200.i 138036144 155904648 168229120 1.129448009 1.218732392 gcc.cp-decl.i 283963419 296759183 312970805 1.045061311 1.102151841 gcc.c-typeck.i 616484068 664855801 682119551 1.078463882 1.106467444 gcc.expr2.i 781639964 858486085 893803320 1.098313961 1.143497468 gcc.expr.i 580765337 593709446 596005444 1.022288019 1.02624142 gcc.g23.i 1063726457 1162692750 1177232886 1.093037352 1.106706408 gcc.s04.i 892109530 948328853 963836294 1.063018409 1.080401298 gcc.scilab.i 62298843 66606465 72922104 1.069144494 1.170521 --- In this version we mix load and ALU operations for short sizes ChangeLog | 16 + sysdeps/x86_64/multiarch/Makefile | 1 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 + sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 389 +++++++++++++++++++++++ sysdeps/x86_64/multiarch/memcpy.S | 4 + sysdeps/x86_64/multiarch/memcpy_chk.S | 3 + sysdeps/x86_64/multiarch/memmove-avx-unaligned.S | 23 ++ sysdeps/x86_64/multiarch/memmove.c | 7 +- sysdeps/x86_64/multiarch/memmove_chk.c | 6 +- sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S | 23 ++ sysdeps/x86_64/multiarch/mempcpy.S | 3 + sysdeps/x86_64/multiarch/mempcpy_chk.S | 3 + 12 files changed, 486 insertions(+), 4 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned.S create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S