Message ID | 20150701153515.GA31676@domone |
---|---|
State | New |
Headers | show |
Ping On Wed, Jul 01, 2015 at 05:35:15PM +0200, Ondřej Bílka wrote: > Hi, > > As I promised previously to improve sse2 strspn I found that I could > convince gcc to generate reasonable code. Its really needed to use goto, > otherwise gcc totally messes up a loop which will be 10% slower than > current. Generated assembly is relatively ok, gcc only increments two > registers by 4 instead one which doesn't affect performance much. > > So here is a generic strcspn/strpbrk implementation and I will send > patch to replace x64 code later. > > > Carlos as usual this is code where you need to make hard decision based > on actual profile. This implementation on gcc workload shows 30% > improvement. That is by exploiting property of workload that match in 4 > bytes is likely. If it isn't a performance will suffer as these checks > increase cost of subsequent characters. > > This is again case where its unavoidable that some patterns will become > five times slower. If accept has 128 characters and match is in second > character then we spend ages with byte-by-byte checks of accept. But if > match is in first character then strchr will find that quickly. Problem > is that 99.8% of call have accept less than 8 bytes. So adding just a > check if size exceeds 8 will harm performance. See following two graphs. > > http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_gcc/result.html > http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_rand/result.html > > I cannot avoid that without my generic string framework where I would > first to vectorized check of first 8 bytes and its dubious if that would > help 32bit architectures. For x64 I correct solution is use sse2 prolog, > hard part is tuning as these break even depending on size. For use in > sse2 I need to add LATE_CHECK as it when called from sse42 routine > accept size is greater than 16 so early heuristic doesn't make much > sense. > > Main gain versus current sse2 assembly is that I align s to 4 bytes so I > could read 4 bytes at start of loop before check. That helps mainly > older cpu to execute loads earlier, it improves performance by 50% on > core2 large inputs, but newer are better at speculative loads so > difference gets smaller until haswell that doesn't show a difference. > > Tested on x64 by including them in test-strcspn/strpbrk > > OK to commit? > > * string/strcspn.c(STRCSPN): Check membership by array lookup. > * string/strcspn.c: Include string/strpbrk.c > * string/test-strcspn.c (simple_strcspn): Use string version. > * string/test-strpbrk.c (simple_strpbrk): Use string version. > > --- > string/strcspn.c | 43 +----------------- > string/strpbrk.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----- > string/test-strcspn.c | 15 ++----- > string/test-strpbrk.c | 14 +----- > 4 files changed, 116 insertions(+), 76 deletions(-) > > diff --git a/string/strcspn.c b/string/strcspn.c > index 2694d2a..2a82dd2 100644 > --- a/string/strcspn.c > +++ b/string/strcspn.c > @@ -1,41 +1,2 @@ > -/* Copyright (C) 1991-2015 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <http://www.gnu.org/licenses/>. */ > - > -#include <string.h> > - > -#undef strcspn > - > -#ifndef STRCSPN > -# define STRCSPN strcspn > -#endif > - > -/* Return the length of the maximum initial segment of S > - which contains no characters from REJECT. */ > -size_t > -STRCSPN (const char *s, const char *reject) > -{ > - size_t count = 0; > - > - while (*s != '\0') > - if (strchr (reject, *s++) == NULL) > - ++count; > - else > - return count; > - > - return count; > -} > -libc_hidden_builtin_def (strcspn) > +#define AS_STRCSPN > +#include "strpbrk.c" > diff --git a/string/strpbrk.c b/string/strpbrk.c > index 4f1d9b7..62808da 100644 > --- a/string/strpbrk.c > +++ b/string/strpbrk.c > @@ -16,26 +16,124 @@ > <http://www.gnu.org/licenses/>. */ > > #include <string.h> > - > +#include <stdint.h> > #undef strpbrk > +#undef strcspn > + > + > +#ifdef AS_STRCSPN > +# ifndef STRPBRK > +# define STRPBRK strcspn > +# endif > +# define RETURN_TYPE size_t > +# define RETURN(c) return c > +#else > +# define RETURN_TYPE char * > +# define RETURN(c) return (char *) (s[c] != '\0' ? s + c : NULL) > +#endif > > #ifndef STRPBRK > #define STRPBRK strpbrk > #endif > > + > /* Find the first occurrence in S of any character in ACCEPT. */ > -char * > -STRPBRK (const char *s, const char *accept) > +RETURN_TYPE > +STRPBRK (const char *_s, const char *_accept) > { > - while (*s != '\0') > + unsigned char *s = (unsigned char *) _s; > + unsigned char *a = (unsigned char *) _accept; > + > +#ifndef LATE_CHECK > + /* We need to align s to 4 bytes. We do check now to avoid expensive table > + construction. */ > + do > + { > + if (s[0] == *a) > + RETURN(0); > + } > + while (*a++); > + a = (unsigned char *) _accept; > + > + /* We couldn't do these checks in one loop as gcc > + messes up register allocation. */ > + do > + { > + if (s[1] == *a) > + RETURN(1); > + } > + while (*a++); > + a = (unsigned char *) _accept; > + > + do > + { > + if (s[2] == *a) > + RETURN(2); > + } > + while (*a++); > + a = (unsigned char *) _accept; > + > + do > + { > + if (s[3] == *a) > + RETURN(3); > + } > + while (*a++); > + a = (unsigned char *) _accept; > + > +#endif > + > + unsigned char table[256]; > + memset (table, 0, 256); > + do > { > - const char *a = accept; > - while (*a != '\0') > - if (*a++ == *s) > - return (char *) s; > - ++s; > + table[*a] = 1; > } > + while (*a++); > + unsigned char s0, s1, s2, s3; > + size_t count = 0; > +#ifdef LATE_CHECK > + s0 = s[count + 0]; > + s1 = s[count + 1]; > + s2 = s[count + 2]; > + s3 = s[count + 3]; > + if (table[s0]) > + goto ret0; > + if (table[s1]) > + goto ret1; > + if (table[s2]) > + goto ret2; > + if (table[s3]) > + goto ret3; > > - return NULL; > +#endif > + > + count = 4 - ((uintptr_t) s) % 4; > + > + while (1) > + { > + s0 = s[count + 0]; > + s1 = s[count + 1]; > + s2 = s[count + 2]; > + s3 = s[count + 3]; > + if (table[s0]) > + goto ret0; > + if (table[s1]) > + goto ret1; > + if (table[s2]) > + goto ret2; > + if (table[s3]) > + goto ret3; > + count += 4; > + } > + ret3: > + count++; > + ret2: > + count++; > + ret1: > + count++; > + ret0: > + RETURN(count); > } > -libc_hidden_builtin_def (strpbrk) > + > +libc_hidden_builtin_def (STRPBRK) > diff --git a/string/test-strcspn.c b/string/test-strcspn.c > index b60a048..50a06e4 100644 > --- a/string/test-strcspn.c > +++ b/string/test-strcspn.c > @@ -31,18 +31,9 @@ IMPL (stupid_strcspn, 0) > IMPL (simple_strcspn, 0) > IMPL (strcspn, 1) > > -size_t > -simple_strcspn (const char *s, const char *rej) > -{ > - const char *r, *str = s; > - char c; > - > - while ((c = *s++) != '\0') > - for (r = rej; *r != '\0'; ++r) > - if (*r == c) > - return s - str - 1; > - return s - str - 1; > -} > +#define AS_STRCSPN > +#define STRPBRK simple_strcspn > +#include "string/strpbrk.c" > > size_t > stupid_strcspn (const char *s, const char *rej) > diff --git a/string/test-strpbrk.c b/string/test-strpbrk.c > index b4ac389..f389e9d 100644 > --- a/string/test-strpbrk.c > +++ b/string/test-strpbrk.c > @@ -32,18 +32,8 @@ IMPL (stupid_strpbrk, 0) > IMPL (simple_strpbrk, 0) > IMPL (strpbrk, 1) > > -char * > -simple_strpbrk (const char *s, const char *rej) > -{ > - const char *r; > - char c; > - > - while ((c = *s++) != '\0') > - for (r = rej; *r != '\0'; ++r) > - if (*r == c) > - return (char *) s - 1; > - return NULL; > -} > +#define STRPBRK simple_strpbrk > +#include "string/strpbrk.c" > > char * > stupid_strpbrk (const char *s, const char *rej) > -- > 1.8.4.rc3
ping On Wed, Aug 12, 2015 at 10:26:04PM +0200, Ondřej Bílka wrote: > Ping > On Wed, Jul 01, 2015 at 05:35:15PM +0200, Ondřej Bílka wrote: > > Hi, > > > > As I promised previously to improve sse2 strspn I found that I could > > convince gcc to generate reasonable code. Its really needed to use goto, > > otherwise gcc totally messes up a loop which will be 10% slower than > > current. Generated assembly is relatively ok, gcc only increments two > > registers by 4 instead one which doesn't affect performance much. > > > > So here is a generic strcspn/strpbrk implementation and I will send > > patch to replace x64 code later. > > > > > > Carlos as usual this is code where you need to make hard decision based > > on actual profile. This implementation on gcc workload shows 30% > > improvement. That is by exploiting property of workload that match in 4 > > bytes is likely. If it isn't a performance will suffer as these checks > > increase cost of subsequent characters. > > > > This is again case where its unavoidable that some patterns will become > > five times slower. If accept has 128 characters and match is in second > > character then we spend ages with byte-by-byte checks of accept. But if > > match is in first character then strchr will find that quickly. Problem > > is that 99.8% of call have accept less than 8 bytes. So adding just a > > check if size exceeds 8 will harm performance. See following two graphs. > > > > http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_gcc/result.html > > http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_rand/result.html > > > > I cannot avoid that without my generic string framework where I would > > first to vectorized check of first 8 bytes and its dubious if that would > > help 32bit architectures. For x64 I correct solution is use sse2 prolog, > > hard part is tuning as these break even depending on size. For use in > > sse2 I need to add LATE_CHECK as it when called from sse42 routine > > accept size is greater than 16 so early heuristic doesn't make much > > sense. > > > > Main gain versus current sse2 assembly is that I align s to 4 bytes so I > > could read 4 bytes at start of loop before check. That helps mainly > > older cpu to execute loads earlier, it improves performance by 50% on > > core2 large inputs, but newer are better at speculative loads so > > difference gets smaller until haswell that doesn't show a difference. > > > > Tested on x64 by including them in test-strcspn/strpbrk > > > > OK to commit? > > > > * string/strcspn.c(STRCSPN): Check membership by array lookup. > > * string/strcspn.c: Include string/strpbrk.c > > * string/test-strcspn.c (simple_strcspn): Use string version. > > * string/test-strpbrk.c (simple_strpbrk): Use string version. > > > > --- > > string/strcspn.c | 43 +----------------- > > string/strpbrk.c | 120 +++++++++++++++++++++++++++++++++++++++++++++----- > > string/test-strcspn.c | 15 ++----- > > string/test-strpbrk.c | 14 +----- > > 4 files changed, 116 insertions(+), 76 deletions(-) > > > > diff --git a/string/strcspn.c b/string/strcspn.c > > index 2694d2a..2a82dd2 100644 > > --- a/string/strcspn.c > > +++ b/string/strcspn.c > > @@ -1,41 +1,2 @@ > > -/* Copyright (C) 1991-2015 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. > > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. > > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - <http://www.gnu.org/licenses/>. */ > > - > > -#include <string.h> > > - > > -#undef strcspn > > - > > -#ifndef STRCSPN > > -# define STRCSPN strcspn > > -#endif > > - > > -/* Return the length of the maximum initial segment of S > > - which contains no characters from REJECT. */ > > -size_t > > -STRCSPN (const char *s, const char *reject) > > -{ > > - size_t count = 0; > > - > > - while (*s != '\0') > > - if (strchr (reject, *s++) == NULL) > > - ++count; > > - else > > - return count; > > - > > - return count; > > -} > > -libc_hidden_builtin_def (strcspn) > > +#define AS_STRCSPN > > +#include "strpbrk.c" > > diff --git a/string/strpbrk.c b/string/strpbrk.c > > index 4f1d9b7..62808da 100644 > > --- a/string/strpbrk.c > > +++ b/string/strpbrk.c > > @@ -16,26 +16,124 @@ > > <http://www.gnu.org/licenses/>. */ > > > > #include <string.h> > > - > > +#include <stdint.h> > > #undef strpbrk > > +#undef strcspn > > + > > + > > +#ifdef AS_STRCSPN > > +# ifndef STRPBRK > > +# define STRPBRK strcspn > > +# endif > > +# define RETURN_TYPE size_t > > +# define RETURN(c) return c > > +#else > > +# define RETURN_TYPE char * > > +# define RETURN(c) return (char *) (s[c] != '\0' ? s + c : NULL) > > +#endif > > > > #ifndef STRPBRK > > #define STRPBRK strpbrk > > #endif > > > > + > > /* Find the first occurrence in S of any character in ACCEPT. */ > > -char * > > -STRPBRK (const char *s, const char *accept) > > +RETURN_TYPE > > +STRPBRK (const char *_s, const char *_accept) > > { > > - while (*s != '\0') > > + unsigned char *s = (unsigned char *) _s; > > + unsigned char *a = (unsigned char *) _accept; > > + > > +#ifndef LATE_CHECK > > + /* We need to align s to 4 bytes. We do check now to avoid expensive table > > + construction. */ > > + do > > + { > > + if (s[0] == *a) > > + RETURN(0); > > + } > > + while (*a++); > > + a = (unsigned char *) _accept; > > + > > + /* We couldn't do these checks in one loop as gcc > > + messes up register allocation. */ > > + do > > + { > > + if (s[1] == *a) > > + RETURN(1); > > + } > > + while (*a++); > > + a = (unsigned char *) _accept; > > + > > + do > > + { > > + if (s[2] == *a) > > + RETURN(2); > > + } > > + while (*a++); > > + a = (unsigned char *) _accept; > > + > > + do > > + { > > + if (s[3] == *a) > > + RETURN(3); > > + } > > + while (*a++); > > + a = (unsigned char *) _accept; > > + > > +#endif > > + > > + unsigned char table[256]; > > + memset (table, 0, 256); > > + do > > { > > - const char *a = accept; > > - while (*a != '\0') > > - if (*a++ == *s) > > - return (char *) s; > > - ++s; > > + table[*a] = 1; > > } > > + while (*a++); > > + unsigned char s0, s1, s2, s3; > > + size_t count = 0; > > +#ifdef LATE_CHECK > > + s0 = s[count + 0]; > > + s1 = s[count + 1]; > > + s2 = s[count + 2]; > > + s3 = s[count + 3]; > > + if (table[s0]) > > + goto ret0; > > + if (table[s1]) > > + goto ret1; > > + if (table[s2]) > > + goto ret2; > > + if (table[s3]) > > + goto ret3; > > > > - return NULL; > > +#endif > > + > > + count = 4 - ((uintptr_t) s) % 4; > > + > > + while (1) > > + { > > + s0 = s[count + 0]; > > + s1 = s[count + 1]; > > + s2 = s[count + 2]; > > + s3 = s[count + 3]; > > + if (table[s0]) > > + goto ret0; > > + if (table[s1]) > > + goto ret1; > > + if (table[s2]) > > + goto ret2; > > + if (table[s3]) > > + goto ret3; > > + count += 4; > > + } > > + ret3: > > + count++; > > + ret2: > > + count++; > > + ret1: > > + count++; > > + ret0: > > + RETURN(count); > > } > > -libc_hidden_builtin_def (strpbrk) > > + > > +libc_hidden_builtin_def (STRPBRK) > > diff --git a/string/test-strcspn.c b/string/test-strcspn.c > > index b60a048..50a06e4 100644 > > --- a/string/test-strcspn.c > > +++ b/string/test-strcspn.c > > @@ -31,18 +31,9 @@ IMPL (stupid_strcspn, 0) > > IMPL (simple_strcspn, 0) > > IMPL (strcspn, 1) > > > > -size_t > > -simple_strcspn (const char *s, const char *rej) > > -{ > > - const char *r, *str = s; > > - char c; > > - > > - while ((c = *s++) != '\0') > > - for (r = rej; *r != '\0'; ++r) > > - if (*r == c) > > - return s - str - 1; > > - return s - str - 1; > > -} > > +#define AS_STRCSPN > > +#define STRPBRK simple_strcspn > > +#include "string/strpbrk.c" > > > > size_t > > stupid_strcspn (const char *s, const char *rej) > > diff --git a/string/test-strpbrk.c b/string/test-strpbrk.c > > index b4ac389..f389e9d 100644 > > --- a/string/test-strpbrk.c > > +++ b/string/test-strpbrk.c > > @@ -32,18 +32,8 @@ IMPL (stupid_strpbrk, 0) > > IMPL (simple_strpbrk, 0) > > IMPL (strpbrk, 1) > > > > -char * > > -simple_strpbrk (const char *s, const char *rej) > > -{ > > - const char *r; > > - char c; > > - > > - while ((c = *s++) != '\0') > > - for (r = rej; *r != '\0'; ++r) > > - if (*r == c) > > - return (char *) s - 1; > > - return NULL; > > -} > > +#define STRPBRK simple_strpbrk > > +#include "string/strpbrk.c" > > > > char * > > stupid_strpbrk (const char *s, const char *rej) > > -- > > 1.8.4.rc3 > > -- > > hardware stress fractures
diff --git a/string/strcspn.c b/string/strcspn.c index 2694d2a..2a82dd2 100644 --- a/string/strcspn.c +++ b/string/strcspn.c @@ -1,41 +1,2 @@ -/* Copyright (C) 1991-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <string.h> - -#undef strcspn - -#ifndef STRCSPN -# define STRCSPN strcspn -#endif - -/* Return the length of the maximum initial segment of S - which contains no characters from REJECT. */ -size_t -STRCSPN (const char *s, const char *reject) -{ - size_t count = 0; - - while (*s != '\0') - if (strchr (reject, *s++) == NULL) - ++count; - else - return count; - - return count; -} -libc_hidden_builtin_def (strcspn) +#define AS_STRCSPN +#include "strpbrk.c" diff --git a/string/strpbrk.c b/string/strpbrk.c index 4f1d9b7..62808da 100644 --- a/string/strpbrk.c +++ b/string/strpbrk.c @@ -16,26 +16,124 @@ <http://www.gnu.org/licenses/>. */ #include <string.h> - +#include <stdint.h> #undef strpbrk +#undef strcspn + + +#ifdef AS_STRCSPN +# ifndef STRPBRK +# define STRPBRK strcspn +# endif +# define RETURN_TYPE size_t +# define RETURN(c) return c +#else +# define RETURN_TYPE char * +# define RETURN(c) return (char *) (s[c] != '\0' ? s + c : NULL) +#endif #ifndef STRPBRK #define STRPBRK strpbrk #endif + /* Find the first occurrence in S of any character in ACCEPT. */ -char * -STRPBRK (const char *s, const char *accept) +RETURN_TYPE +STRPBRK (const char *_s, const char *_accept) { - while (*s != '\0') + unsigned char *s = (unsigned char *) _s; + unsigned char *a = (unsigned char *) _accept; + +#ifndef LATE_CHECK + /* We need to align s to 4 bytes. We do check now to avoid expensive table + construction. */ + do + { + if (s[0] == *a) + RETURN(0); + } + while (*a++); + a = (unsigned char *) _accept; + + /* We couldn't do these checks in one loop as gcc + messes up register allocation. */ + do + { + if (s[1] == *a) + RETURN(1); + } + while (*a++); + a = (unsigned char *) _accept; + + do + { + if (s[2] == *a) + RETURN(2); + } + while (*a++); + a = (unsigned char *) _accept; + + do + { + if (s[3] == *a) + RETURN(3); + } + while (*a++); + a = (unsigned char *) _accept; + +#endif + + unsigned char table[256]; + memset (table, 0, 256); + do { - const char *a = accept; - while (*a != '\0') - if (*a++ == *s) - return (char *) s; - ++s; + table[*a] = 1; } + while (*a++); + unsigned char s0, s1, s2, s3; + size_t count = 0; +#ifdef LATE_CHECK + s0 = s[count + 0]; + s1 = s[count + 1]; + s2 = s[count + 2]; + s3 = s[count + 3]; + if (table[s0]) + goto ret0; + if (table[s1]) + goto ret1; + if (table[s2]) + goto ret2; + if (table[s3]) + goto ret3; - return NULL; +#endif + + count = 4 - ((uintptr_t) s) % 4; + + while (1) + { + s0 = s[count + 0]; + s1 = s[count + 1]; + s2 = s[count + 2]; + s3 = s[count + 3]; + if (table[s0]) + goto ret0; + if (table[s1]) + goto ret1; + if (table[s2]) + goto ret2; + if (table[s3]) + goto ret3; + count += 4; + } + ret3: + count++; + ret2: + count++; + ret1: + count++; + ret0: + RETURN(count); } -libc_hidden_builtin_def (strpbrk) + +libc_hidden_builtin_def (STRPBRK) diff --git a/string/test-strcspn.c b/string/test-strcspn.c index b60a048..50a06e4 100644 --- a/string/test-strcspn.c +++ b/string/test-strcspn.c @@ -31,18 +31,9 @@ IMPL (stupid_strcspn, 0) IMPL (simple_strcspn, 0) IMPL (strcspn, 1) -size_t -simple_strcspn (const char *s, const char *rej) -{ - const char *r, *str = s; - char c; - - while ((c = *s++) != '\0') - for (r = rej; *r != '\0'; ++r) - if (*r == c) - return s - str - 1; - return s - str - 1; -} +#define AS_STRCSPN +#define STRPBRK simple_strcspn +#include "string/strpbrk.c" size_t stupid_strcspn (const char *s, const char *rej) diff --git a/string/test-strpbrk.c b/string/test-strpbrk.c index b4ac389..f389e9d 100644 --- a/string/test-strpbrk.c +++ b/string/test-strpbrk.c @@ -32,18 +32,8 @@ IMPL (stupid_strpbrk, 0) IMPL (simple_strpbrk, 0) IMPL (strpbrk, 1) -char * -simple_strpbrk (const char *s, const char *rej) -{ - const char *r; - char c; - - while ((c = *s++) != '\0') - for (r = rej; *r != '\0'; ++r) - if (*r == c) - return (char *) s - 1; - return NULL; -} +#define STRPBRK simple_strpbrk +#include "string/strpbrk.c" char * stupid_strpbrk (const char *s, const char *rej)