diff mbox

x86-64: Remove plt bypassing of ifuncs.

Message ID 20150608181414.GA32139@domone
State New
Headers show

Commit Message

Ondřej Bílka June 8, 2015, 6:14 p.m. UTC
Hi,

when I considered cache footprint of strcpy I realized that it doesn't
make sense at all.

First there is comment to save only few cycles which is false on lot of
cases anyway, mainly in memcpy and memset where correct implementation 
easily  makes 20 cycle difference. Most affected is memset where we
could waste hundreds of cycles in calloc just because we want to save
few cycles.

A real problem is that we forgotten about cost of instruction cache again.
We save maybe few cycles when we are on processors that selects default
anyway. However when they differ we will use two implementations, one
could be considerably worse and lose more than three cycles on plt
redirection to instruction cache misses from additional kilobyte
assembly implementation.

Here I removed bypassing from x64 functions, question is how disable
generically bypassing in ifuncs, for normal functions plt bypassing has
benefits but now its something that gcc should do, see recent noplt
proposal on gcc list.

Here I did removal for x64 and guarded hidden_proto with !defined
__x86_64__, I don't know if its better to let other maintainers do same
or come with generic solution.

For x64 I could keep most performance while resolving ifunc by trick
that I use in ufunc, internally use macro to call function as function
pointer, on startup we check cpuid and select correct per-cpu table.
I did some microbenchmark and it looks it has similar performance as
direct call.

When removing these I found that long ago I planed to untangle strcpy
macros where I temporary compiled generic routine of strncpy, stpncpy
and strncat. As generic have poor performance and performance aware user
shouldn't ever use them I replaced these for now with unaligned ones.


Comments?

2015-06-08  Ondřej Bílka  <neleai@seznam.cz>

        * include/string.h [__x86_64__]: Guard hidded_proto of functions.
        * sysdeps/generic/symbol-hacks.h [__x86_64__]: Disable.
        * sysdeps/x86_64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Remove __strncpy_sse2, __stpncpy_sse2,
	__strncat_sse2 entries.
        * sysdeps/x86_64/multiarch/Makefile (routines): Remove strncpy-c,
	stpncpy-c, strncat-c.
        * sysdeps/x86_64/memcpy.S: Remove plt bypassing.
        * sysdeps/x86_64/multiarch/memcmp.S: Likewise.
        * sysdeps/x86_64/multiarch/memcpy.S: Likewise.
        * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
        * sysdeps/x86_64/multiarch/memset.S: Likewise.
        * sysdeps/x86_64/multiarch/strcat.S: Likewise.
        * sysdeps/x86_64/multiarch/strchr.S: Likewise.
        * sysdeps/x86_64/multiarch/strcmp.S: Likewise.
        * sysdeps/x86_64/multiarch/strcpy.S: Likewise.
        * sysdeps/x86_64/multiarch/stpncpy-c.c: Remove.
        * sysdeps/x86_64/multiarch/strncat-c.c: Likewise.
        * sysdeps/x86_64/multiarch/strncpy-c.c: Likewise.

Comments

Joseph Myers June 8, 2015, 8:57 p.m. UTC | #1
On Mon, 8 Jun 2015, Ondřej Bílka wrote:

> Here I removed bypassing from x64 functions, question is how disable
> generically bypassing in ifuncs, for normal functions plt bypassing has
> benefits but now its something that gcc should do, see recent noplt
> proposal on gcc list.
> 
> Here I did removal for x64 and guarded hidden_proto with !defined
> __x86_64__, I don't know if its better to let other maintainers do same
> or come with generic solution.

I think it should work to make the hidden alias be an alias for the IFUNC 
(in the multi-arch case - obviously with --disable-multi-arch it needs to 
continue to exist as an alias for the non-IFUNC, and it's desirable to run 
tests for both cases for any such patch).  That way you shouldn't need to 
change include/string.h at all.  The hidden aliases will get PLT entries, 
but such entries don't cause the localplt test to fail and there are a few 
already anyway.
diff mbox

Patch

diff --git a/include/string.h b/include/string.h
index c57671e..56ccdf1 100644
--- a/include/string.h
+++ b/include/string.h
@@ -72,7 +72,6 @@  extern __typeof (strncasecmp_l) __strncasecmp_l;
 
 libc_hidden_proto (__mempcpy)
 libc_hidden_proto (__stpcpy)
-libc_hidden_proto (__stpncpy)
 libc_hidden_proto (__rawmemchr)
 libc_hidden_proto (__strcasecmp)
 libc_hidden_proto (__strcasecmp_l)
@@ -98,11 +97,15 @@  libc_hidden_proto (__memmem)
 libc_hidden_proto (__ffs)
 
 libc_hidden_builtin_proto (memchr)
+#ifndef __x86_64__
+libc_hidden_proto (__stpncpy)
 libc_hidden_builtin_proto (memcpy)
+libc_hidden_builtin_proto (memset)
+libc_hidden_builtin_proto (strncpy)
+#endif
 libc_hidden_builtin_proto (mempcpy)
 libc_hidden_builtin_proto (memcmp)
 libc_hidden_builtin_proto (memmove)
-libc_hidden_builtin_proto (memset)
 libc_hidden_builtin_proto (strcat)
 libc_hidden_builtin_proto (strchr)
 libc_hidden_builtin_proto (strcmp)
@@ -110,7 +113,6 @@  libc_hidden_builtin_proto (strcpy)
 libc_hidden_builtin_proto (strcspn)
 libc_hidden_builtin_proto (strlen)
 libc_hidden_builtin_proto (strncmp)
-libc_hidden_builtin_proto (strncpy)
 libc_hidden_builtin_proto (strpbrk)
 libc_hidden_builtin_proto (stpcpy)
 libc_hidden_builtin_proto (strrchr)
diff --git a/sysdeps/generic/symbol-hacks.h b/sysdeps/generic/symbol-hacks.h
index ce576c9..fe13b37 100644
--- a/sysdeps/generic/symbol-hacks.h
+++ b/sysdeps/generic/symbol-hacks.h
@@ -1,6 +1,7 @@ 
 /* Some compiler optimizations may transform loops into memset/memmove
    calls and without proper declaration it may generate PLT calls.  */
-#if !defined __ASSEMBLER__ && IS_IN (libc) && defined SHARED
+#if !defined __ASSEMBLER__ && IS_IN (libc) && defined SHARED && \
+    !defined __x86_64__
 asm ("memmove = __GI_memmove");
 asm ("memset = __GI_memset");
 asm ("memcpy = __GI_memcpy");
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index eea8c2a..8356259 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -32,9 +32,6 @@ 
 #  define RETVAL	(-8)
 #  if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
 #    define memcpy	__memcpy
-#    undef libc_hidden_builtin_def
-#    define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy
 #  endif
 #endif
 #define SAVE0	(RETVAL - 8)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d7002a9..43bb74b 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,7 +6,7 @@  endif
 
 ifeq ($(subdir),string)
 
-sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
+sysdep_routines += strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b64e4f1..bd0b20b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,8 +83,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_SSSE3,
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
-			      __stpncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
+			      __stpncpy_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.S.  */
   IFUNC_IMPL (i, name, stpcpy,
@@ -174,16 +173,14 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_SSSE3,
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
-			      __strncat_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
+			      __strncat_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/strncpy.S.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_SSSE3,
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
-			      __strncpy_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
+			      __strncpy_sse2_unaligned))
 
   /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b4636..3af38a1 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -57,14 +57,6 @@  END(memcmp)
 # define END(name) \
 	cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
 
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* IFUNC doesn't work with the hidden functions in shared library since
-   they will be called without setting up EBX needed for PLT which is
-   used by IFUNC.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
-# endif
 #endif
 
 #include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 4e18cd3..e65db51 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -70,12 +70,6 @@  END(__new_memcpy)
 # define END_CHK(name) \
 	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
 
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
 
 versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
 #endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 2eaacdf..c360b94 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -66,15 +66,6 @@  END(__mempcpy)
 # define END_CHK(name) \
 	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
 
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
 #endif
 
 #include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index c5f1fb3..92499b1d 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -44,14 +44,6 @@  END(memset)
 #  undef __memset_chk
 #  define __memset_chk __memset_chk_sse2
 
-#  ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memset calls through a PLT.
-   The speedup we get from using GPR instruction is likely eaten away
-   by the indirect call in the PLT.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2
-#  endif
 
 #  undef strong_alias
 #  define strong_alias(original, alias)
diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c
deleted file mode 100644
index 2fde77d..0000000
--- a/sysdeps/x86_64/multiarch/stpncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@ 
-#define STPNCPY __stpncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
-  __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2);
-#endif
-
-#include "stpncpy.c"
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
index 44993fa..5303995 100644
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -29,16 +29,12 @@ 
 
 #ifdef USE_AS_STRNCAT
 # define STRCAT_SSSE3	         	__strncat_ssse3
-# define STRCAT_SSE2	            	__strncat_sse2
+# define STRCAT_SSE2	            	__strncat_sse2_unaligned
 # define STRCAT_SSE2_UNALIGNED    	__strncat_sse2_unaligned
-# define __GI_STRCAT	            	__GI_strncat
-# define __GI___STRCAT              __GI___strncat
 #else
 # define STRCAT_SSSE3	         	__strcat_ssse3
 # define STRCAT_SSE2	            	__strcat_sse2
 # define STRCAT_SSE2_UNALIGNED    	__strcat_sse2_unaligned
-# define __GI_STRCAT	            	__GI_strcat
-# define __GI___STRCAT              __GI___strcat
 #endif
 
 
@@ -71,15 +67,6 @@  END(STRCAT)
 # undef END
 # define END(name) \
 	cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcat calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
-	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2
 #endif
 
 #ifndef USE_AS_STRNCAT
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index af55fac..c375854 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -48,12 +48,6 @@  END(strchr)
 # undef END
 # define END(name) \
 	cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strchr; __GI_strchr = __strchr_sse2
 #endif
 
 #include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..9799410 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -216,12 +216,6 @@  weak_alias (__strncasecmp, strncasecmp)
 	cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2
 # endif
 
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2
 #endif
 
 #include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 9464ee8..7409598 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -30,28 +30,22 @@ 
 #ifdef USE_AS_STPCPY
 # ifdef USE_AS_STRNCPY
 #  define STRCPY_SSSE3		__stpncpy_ssse3
-#  define STRCPY_SSE2		__stpncpy_sse2
+#  define STRCPY_SSE2		__stpncpy_sse2_unaligned
 #  define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned
-#  define __GI_STRCPY		__GI_stpncpy
-#  define __GI___STRCPY		__GI___stpncpy
 # else
 #  define STRCPY_SSSE3		__stpcpy_ssse3
 #  define STRCPY_SSE2		__stpcpy_sse2
 #  define STRCPY_SSE2_UNALIGNED	__stpcpy_sse2_unaligned
-#  define __GI_STRCPY		__GI_stpcpy
-#  define __GI___STRCPY		__GI___stpcpy
 # endif
 #else
 # ifdef USE_AS_STRNCPY
 #  define STRCPY_SSSE3		__strncpy_ssse3
-#  define STRCPY_SSE2		__strncpy_sse2
+#  define STRCPY_SSE2		__strncpy_sse2_unaligned
 #  define STRCPY_SSE2_UNALIGNED	__strncpy_sse2_unaligned
-#  define __GI_STRCPY		__GI_strncpy
 # else
 #  define STRCPY_SSSE3		__strcpy_ssse3
 #  define STRCPY_SSE2		__strcpy_sse2
 #  define STRCPY_SSE2_UNALIGNED	__strcpy_sse2_unaligned
-#  define __GI_STRCPY		__GI_strcpy
 # endif
 #endif
 
@@ -85,15 +79,6 @@  END(STRCPY)
 # undef END
 # define END(name) \
 	cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
-	.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
 #endif
 
 #ifndef USE_AS_STRNCPY
diff --git a/sysdeps/x86_64/multiarch/strncat-c.c b/sysdeps/x86_64/multiarch/strncat-c.c
deleted file mode 100644
index a3cdbff..0000000
--- a/sysdeps/x86_64/multiarch/strncat-c.c
+++ /dev/null
@@ -1,8 +0,0 @@ 
-#define STRNCAT __strncat_sse2
-#ifdef SHARED
-#undef libc_hidden_def
-#define libc_hidden_def(name) \
-  __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2);
-#endif
-
-#include "string/strncat.c"
diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c
deleted file mode 100644
index 296c32c..0000000
--- a/sysdeps/x86_64/multiarch/strncpy-c.c
+++ /dev/null
@@ -1,8 +0,0 @@ 
-#define STRNCPY __strncpy_sse2
-#ifdef SHARED
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2);
-#endif
-
-#include "strncpy.c"