Message ID | 20140318100138.GC8415@domone.podge |
---|---|
State | New |
Headers | show |
ping On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > To make a strtok faster and improve performance in general we need to do one > additional change. > > A comment: > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > The speedup we get from using SSE4.2 instruction is likely eaten away > by the indirect call in the PLT. */ > > Does not make sense at all because nobody bothered to check it. Gap > between these implementations is quite big, when haystack is empty a > sse2 is around 40 cycles slower because it needs to populate a lookup > table and difference only increases with size. That is much bigger than > plt slowdown which is few cycles. > > Even benchtest show a gap which also may be reverse by branch > misprediction but my internal benchmark shown. > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > This patch also handles strpbrk which is implemented by including a > x86_64/multiarch/strcspn.S file. > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > index 24f55e9..1b3e1aa 100644 > --- a/sysdeps/x86_64/multiarch/strcspn.S > +++ b/sysdeps/x86_64/multiarch/strcspn.S > @@ -65,14 +65,7 @@ END(STRCSPN) > # undef END > # define END(name) \ > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > -# undef libc_hidden_builtin_def > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > - The speedup we get from using SSE4.2 instruction is likely eaten away > - by the indirect call in the PLT. 
*/ > -# define libc_hidden_builtin_def(name) \ > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > #endif > - > #endif /* HAVE_SSE4_SUPPORT */ > > #ifdef USE_AS_STRPBRK > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > index bf7308e..fde1e1e 100644 > --- a/sysdeps/x86_64/multiarch/strspn.S > +++ b/sysdeps/x86_64/multiarch/strspn.S > @@ -50,12 +50,6 @@ END(strspn) > # undef END > # define END(name) \ > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > -# undef libc_hidden_builtin_def > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > - The speedup we get from using SSE4.2 instruction is likely eaten away > - by the indirect call in the PLT. */ > -# define libc_hidden_builtin_def(name) \ > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > #endif > > #endif /* HAVE_SSE4_SUPPORT */
ping On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > ping > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > To make a strtok faster and improve performance in general we need to do one > > additional change. > > > > A comment: > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > The speedup we get from using SSE4.2 instruction is likely eaten away > > by the indirect call in the PLT. */ > > > > Does not make sense at all because nobody bothered to check it. Gap > > between these implementations is quite big, when haystack is empty a > > sse2 is around 40 cycles slower because it needs to populate a lookup > > table and difference only increases with size. That is much bigger than > > plt slowdown which is few cycles. > > > > Even benchtest show a gap which also may be reverse by branch > > misprediction but my internal benchmark shown. > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > This patch also handles strpbrk which is implemented by including a > > x86_64/multiarch/strcspn.S file. > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > index 24f55e9..1b3e1aa 100644 > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > @@ -65,14 +65,7 @@ END(STRCSPN) > > # undef END > > # define END(name) \ > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > -# undef libc_hidden_builtin_def > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > - by the indirect call in the PLT. 
*/ > > -# define libc_hidden_builtin_def(name) \ > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > #endif > > - > > #endif /* HAVE_SSE4_SUPPORT */ > > > > #ifdef USE_AS_STRPBRK > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > index bf7308e..fde1e1e 100644 > > --- a/sysdeps/x86_64/multiarch/strspn.S > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > @@ -50,12 +50,6 @@ END(strspn) > > # undef END > > # define END(name) \ > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > -# undef libc_hidden_builtin_def > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > - by the indirect call in the PLT. */ > > -# define libc_hidden_builtin_def(name) \ > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > #endif > > > > #endif /* HAVE_SSE4_SUPPORT */ > > -- > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible!
On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > ping > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > ping > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > To make a strtok faster and improve performance in general we need to do one > > > additional change. > > > > > > A comment: > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > by the indirect call in the PLT. */ > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > between these implementations is quite big, when haystack is empty a > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > table and difference only increases with size. That is much bigger than > > > plt slowdown which is few cycles. > > > > > > Even benchtest show a gap which also may be reverse by branch > > > misprediction but my internal benchmark shown. > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > This patch also handles strpbrk which is implemented by including a > > > x86_64/multiarch/strcspn.S file. > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > index 24f55e9..1b3e1aa 100644 > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > # undef END > > > # define END(name) \ > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > -# undef libc_hidden_builtin_def > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. 
> > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > - by the indirect call in the PLT. */ > > > -# define libc_hidden_builtin_def(name) \ > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > #endif > > > - > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > #ifdef USE_AS_STRPBRK > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > index bf7308e..fde1e1e 100644 > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > @@ -50,12 +50,6 @@ END(strspn) > > > # undef END > > > # define END(name) \ > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > -# undef libc_hidden_builtin_def > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > - by the indirect call in the PLT. */ > > > -# define libc_hidden_builtin_def(name) \ > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > #endif > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > -- > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > -- > > Look, buddy: Windows 3.1 IS A General Protection Fault.
ping On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > ping > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > ping > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > To make a strtok faster and improve performance in general we need to do one > > > > additional change. > > > > > > > > A comment: > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > by the indirect call in the PLT. */ > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > between these implementations is quite big, when haystack is empty a > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > table and difference only increases with size. That is much bigger than > > > > plt slowdown which is few cycles. > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > misprediction but my internal benchmark shown. > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. 
> > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > index 24f55e9..1b3e1aa 100644 > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > # undef END > > > > # define END(name) \ > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > -# undef libc_hidden_builtin_def > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > - by the indirect call in the PLT. */ > > > > -# define libc_hidden_builtin_def(name) \ > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > #endif > > > > - > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > #ifdef USE_AS_STRPBRK > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > index bf7308e..fde1e1e 100644 > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > # undef END > > > > # define END(name) \ > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > -# undef libc_hidden_builtin_def > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > - by the indirect call in the PLT. */ > > > > -# define libc_hidden_builtin_def(name) \ > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > #endif > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > -- > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > > > -- > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. > > -- > > Failure to adjust for daylight savings time.
ping On Mon, May 12, 2014 at 02:00:11PM +0200, Ondřej Bílka wrote: > ping > On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > > ping > > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > > ping > > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > > To make a strtok faster and improve performance in general we need to do one > > > > > additional change. > > > > > > > > > > A comment: > > > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > by the indirect call in the PLT. */ > > > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > > between these implementations is quite big, when haystack is empty a > > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > > table and difference only increases with size. That is much bigger than > > > > > plt slowdown which is few cycles. > > > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > > misprediction but my internal benchmark shown. > > > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. 
> > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > > index 24f55e9..1b3e1aa 100644 > > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > > # undef END > > > > > # define END(name) \ > > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > > -# undef libc_hidden_builtin_def > > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > - by the indirect call in the PLT. */ > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > > #endif > > > > > - > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > #ifdef USE_AS_STRPBRK > > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > > index bf7308e..fde1e1e 100644 > > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > > # undef END > > > > > # define END(name) \ > > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > > -# undef libc_hidden_builtin_def > > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > - by the indirect call in the PLT. */ > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > > #endif > > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > -- > > > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > > > > > -- > > > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. 
> > > > -- > > > > Failure to adjust for daylight savings time. > > -- > > monitor VLF leakage
ping On Sat, May 24, 2014 at 01:23:13AM +0200, Ondřej Bílka wrote: > ping > On Mon, May 12, 2014 at 02:00:11PM +0200, Ondřej Bílka wrote: > > ping > > On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > > > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > > > ping > > > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > > > ping > > > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > > > To make a strtok faster and improve performance in general we need to do one > > > > > > additional change. > > > > > > > > > > > > A comment: > > > > > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > by the indirect call in the PLT. */ > > > > > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > > > between these implementations is quite big, when haystack is empty a > > > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > > > table and difference only increases with size. That is much bigger than > > > > > > plt slowdown which is few cycles. > > > > > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > > > misprediction but my internal benchmark shown. > > > > > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. 
> > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > index 24f55e9..1b3e1aa 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > > > # undef END > > > > > > # define END(name) \ > > > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > > > -# undef libc_hidden_builtin_def > > > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > - by the indirect call in the PLT. */ > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > > > #endif > > > > > > - > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > #ifdef USE_AS_STRPBRK > > > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > > > index bf7308e..fde1e1e 100644 > > > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > > > # undef END > > > > > > # define END(name) \ > > > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > > > -# undef libc_hidden_builtin_def > > > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > - by the indirect call in the PLT. */ > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > > > #endif > > > > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > -- > > > > > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! 
> > > > > > > > -- > > > > > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. > > > > > > -- > > > > > > Failure to adjust for daylight savings time. > > > > -- > > > > monitor VLF leakage > > -- > > Stale file handle (next time use Tupperware(tm)!)
ping On Wed, Jun 04, 2014 at 02:47:54PM +0200, Ondřej Bílka wrote: > ping > On Sat, May 24, 2014 at 01:23:13AM +0200, Ondřej Bílka wrote: > > ping > > On Mon, May 12, 2014 at 02:00:11PM +0200, Ondřej Bílka wrote: > > > ping > > > On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > > > > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > > > > ping > > > > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > > > > ping > > > > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > > > > To make a strtok faster and improve performance in general we need to do one > > > > > > > additional change. > > > > > > > > > > > > > > A comment: > > > > > > > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > by the indirect call in the PLT. */ > > > > > > > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > > > > between these implementations is quite big, when haystack is empty a > > > > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > > > > table and difference only increases with size. That is much bigger than > > > > > > > plt slowdown which is few cycles. > > > > > > > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > > > > misprediction but my internal benchmark shown. > > > > > > > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. 
> > > > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > index 24f55e9..1b3e1aa 100644 > > > > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > > > > # undef END > > > > > > > # define END(name) \ > > > > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > - by the indirect call in the PLT. */ > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > > > > #endif > > > > > > > - > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > > > #ifdef USE_AS_STRPBRK > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > index bf7308e..fde1e1e 100644 > > > > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > > > > # undef END > > > > > > > # define END(name) \ > > > > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > - by the indirect call in the PLT. 
*/ > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > > > > #endif > > > > > > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > -- > > > > > > > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > > > > > > > > > -- > > > > > > > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. > > > > > > > > -- > > > > > > > > Failure to adjust for daylight savings time. > > > > > > -- > > > > > > monitor VLF leakage > > > > -- > > > > Stale file handle (next time use Tupperware(tm)!) > > -- > > piezo-electric interference
ping On Tue, Jun 24, 2014 at 12:41:52PM +0200, Ondřej Bílka wrote: > ping > On Wed, Jun 04, 2014 at 02:47:54PM +0200, Ondřej Bílka wrote: > > ping > > On Sat, May 24, 2014 at 01:23:13AM +0200, Ondřej Bílka wrote: > > > ping > > > On Mon, May 12, 2014 at 02:00:11PM +0200, Ondřej Bílka wrote: > > > > ping > > > > On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > > > > > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > > > > > ping > > > > > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > > > > > ping > > > > > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > > > > > To make a strtok faster and improve performance in general we need to do one > > > > > > > > additional change. > > > > > > > > > > > > > > > > A comment: > > > > > > > > > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > by the indirect call in the PLT. */ > > > > > > > > > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > > > > > between these implementations is quite big, when haystack is empty a > > > > > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > > > > > table and difference only increases with size. That is much bigger than > > > > > > > > plt slowdown which is few cycles. > > > > > > > > > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > > > > > misprediction but my internal benchmark shown. 
> > > > > > > > > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > > > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > > > > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > index 24f55e9..1b3e1aa 100644 > > > > > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > > > > > # undef END > > > > > > > > # define END(name) \ > > > > > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > - by the indirect call in the PLT. 
*/ > > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > > > > > #endif > > > > > > > > - > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > > > > > #ifdef USE_AS_STRPBRK > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > index bf7308e..fde1e1e 100644 > > > > > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > > > > > # undef END > > > > > > > > # define END(name) \ > > > > > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > - by the indirect call in the PLT. */ > > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > > > > > #endif > > > > > > > > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > > > > > > > > > > > -- > > > > > > > > > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. > > > > > > > > > > -- > > > > > > > > > > Failure to adjust for daylight savings time. > > > > > > > > -- > > > > > > > > monitor VLF leakage > > > > > > -- > > > > > > Stale file handle (next time use Tupperware(tm)!) > > > > -- > > > > piezo-electric interference > > -- > > Hard drive sleeping. Let it wake up on it's own...
ping On Wed, Dec 10, 2014 at 03:39:31PM +0100, Ondřej Bílka wrote: > ping > On Tue, Jun 24, 2014 at 12:41:52PM +0200, Ondřej Bílka wrote: > > ping > > On Wed, Jun 04, 2014 at 02:47:54PM +0200, Ondřej Bílka wrote: > > > ping > > > On Sat, May 24, 2014 at 01:23:13AM +0200, Ondřej Bílka wrote: > > > > ping > > > > On Mon, May 12, 2014 at 02:00:11PM +0200, Ondřej Bílka wrote: > > > > > ping > > > > > On Sat, Apr 12, 2014 at 09:24:47PM +0200, Ondřej Bílka wrote: > > > > > > On Sat, Apr 05, 2014 at 04:48:41PM +0200, Ondřej Bílka wrote: > > > > > > > ping > > > > > > > On Thu, Mar 27, 2014 at 10:18:06PM +0100, Ondřej Bílka wrote: > > > > > > > > ping > > > > > > > > On Tue, Mar 18, 2014 at 11:01:38AM +0100, Ondřej Bílka wrote: > > > > > > > > > To make a strtok faster and improve performance in general we need to do one > > > > > > > > > additional change. > > > > > > > > > > > > > > > > > > A comment: > > > > > > > > > > > > > > > > > > /* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > > > The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > > by the indirect call in the PLT. */ > > > > > > > > > > > > > > > > > > Does not make sense at all because nobody bothered to check it. Gap > > > > > > > > > between these implementations is quite big, when haystack is empty a > > > > > > > > > sse2 is around 40 cycles slower because it needs to populate a lookup > > > > > > > > > table and difference only increases with size. That is much bigger than > > > > > > > > > plt slowdown which is few cycles. > > > > > > > > > > > > > > > > > > Even benchtest show a gap which also may be reverse by branch > > > > > > > > > misprediction but my internal benchmark shown. 
> > > > > > > > > > > > > > > > > > simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2 > > > > > > > > > Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719 > > > > > > > > > Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625 > > > > > > > > > > > > > > > > > > This patch also handles strpbrk which is implemented by including a > > > > > > > > > x86_64/multiarch/strcspn.S file. > > > > > > > > > > > > > > > > > > * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection. > > > > > > > > > * sysdeps/x86_64/multiarch/strcspn.S: Likewise. > > > > > > > > > > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > > index 24f55e9..1b3e1aa 100644 > > > > > > > > > --- a/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > > +++ b/sysdeps/x86_64/multiarch/strcspn.S > > > > > > > > > @@ -65,14 +65,7 @@ END(STRCSPN) > > > > > > > > > # undef END > > > > > > > > > # define END(name) \ > > > > > > > > > cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 > > > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > > > -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. > > > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > > - by the indirect call in the PLT. 
*/ > > > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > > > - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 > > > > > > > > > #endif > > > > > > > > > - > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > > > > > > > #ifdef USE_AS_STRPBRK > > > > > > > > > diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > > index bf7308e..fde1e1e 100644 > > > > > > > > > --- a/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > > +++ b/sysdeps/x86_64/multiarch/strspn.S > > > > > > > > > @@ -50,12 +50,6 @@ END(strspn) > > > > > > > > > # undef END > > > > > > > > > # define END(name) \ > > > > > > > > > cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 > > > > > > > > > -# undef libc_hidden_builtin_def > > > > > > > > > -/* It doesn't make sense to send libc-internal strspn calls through a PLT. > > > > > > > > > - The speedup we get from using SSE4.2 instruction is likely eaten away > > > > > > > > > - by the indirect call in the PLT. */ > > > > > > > > > -# define libc_hidden_builtin_def(name) \ > > > > > > > > > - .globl __GI_strspn; __GI_strspn = __strspn_sse2 > > > > > > > > > #endif > > > > > > > > > > > > > > > > > > #endif /* HAVE_SSE4_SUPPORT */ > > > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > > > Too many little pins on CPU confusing it, bend back and forth until 10-20% are neatly removed. Do _not_ leave metal bits visible! > > > > > > > > > > > > > > -- > > > > > > > > > > > > > > Look, buddy: Windows 3.1 IS A General Protection Fault. > > > > > > > > > > > > -- > > > > > > > > > > > > Failure to adjust for daylight savings time. > > > > > > > > > > -- > > > > > > > > > > monitor VLF leakage > > > > > > > > -- > > > > > > > > Stale file handle (next time use Tupperware(tm)!) > > > > > > -- > > > > > > piezo-electric interference > > > > -- > > > > Hard drive sleeping. Let it wake up on it's own... > > -- > > tachyon emissions overloading the system
On 18 Mar 2014 11:01, Ondřej Bílka wrote:
> To make a strtok faster and improve performance in general we need to do one
> additional change.
>
> A comment:
>
> /* It doesn't make sense to send libc-internal strcspn calls through a PLT.
> The speedup we get from using SSE4.2 instruction is likely eaten away
> by the indirect call in the PLT. */
>
> Does not make sense at all because nobody bothered to check it. Gap
> between these implementations is quite big, when haystack is empty a
> sse2 is around 40 cycles slower because it needs to populate a lookup
> table and difference only increases with size. That is much bigger than
> plt slowdown which is few cycles.
>
> Even benchtest show a gap which also may be reverse by branch
> misprediction but my internal benchmark shown.
>
> simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2
> Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719
> Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625
>
> This patch also handles strpbrk which is implemented by including a
> x86_64/multiarch/strcspn.S file.
>
> * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection.
> * sysdeps/x86_64/multiarch/strcspn.S: Likewise.

Since H.J. wrote the code, he should probably be the one to approve this change.
-mike
On Thu, Mar 5, 2015 at 6:03 PM, Mike Frysinger <vapier@gentoo.org> wrote:
> On 18 Mar 2014 11:01, Ondřej Bílka wrote:
>> To make a strtok faster and improve performance in general we need to do one
>> additional change.
>>
>> A comment:
>>
>> /* It doesn't make sense to send libc-internal strcspn calls through a PLT.
>> The speedup we get from using SSE4.2 instruction is likely eaten away
>> by the indirect call in the PLT. */
>>
>> Does not make sense at all because nobody bothered to check it. Gap
>> between these implementations is quite big, when haystack is empty a
>> sse2 is around 40 cycles slower because it needs to populate a lookup
>> table and difference only increases with size. That is much bigger than
>> plt slowdown which is few cycles.
>>
>> Even benchtest show a gap which also may be reverse by branch
>> misprediction but my internal benchmark shown.
>>
>> simple_strspn stupid_strspn __strspn_sse42 __strspn_sse2
>> Length 0, alignment 0, acc len 6: 18.6562 35.2344 17.0469 61.6719
>> Length 6, alignment 0, acc len 6: 59.5469 72.5781 16.4219 73.625
>>
>> This patch also handles strpbrk which is implemented by including a
>> x86_64/multiarch/strcspn.S file.
>>
>> * sysdeps/x86_64/multiarch/strspn.S: Remove plt indirection.
>> * sysdeps/x86_64/multiarch/strcspn.S: Likewise.
>
> since H.J. wrote the code, he probably should be the one approving this change
> -mike

Looks good to me. Please commit. Sorry for the long delay. Thanks.
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S index 24f55e9..1b3e1aa 100644 --- a/sysdeps/x86_64/multiarch/strcspn.S +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -65,14 +65,7 @@ END(STRCSPN) # undef END # define END(name) \ cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strcspn calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 #endif - #endif /* HAVE_SSE4_SUPPORT */ #ifdef USE_AS_STRPBRK diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S index bf7308e..fde1e1e 100644 --- a/sysdeps/x86_64/multiarch/strspn.S +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -50,12 +50,6 @@ END(strspn) # undef END # define END(name) \ cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 -# undef libc_hidden_builtin_def -/* It doesn't make sense to send libc-internal strspn calls through a PLT. - The speedup we get from using SSE4.2 instruction is likely eaten away - by the indirect call in the PLT. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_strspn; __GI_strspn = __strspn_sse2 #endif #endif /* HAVE_SSE4_SUPPORT */