Message ID | AM3PR08MB0088F82AB469E058650B493283F60@AM3PR08MB0088.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Some comments below: On 08-01-2016 16:40, Wilco Dijkstra wrote: > Improve strcspn performance using a much faster algorithm. It is kept simple > so it works well on most targets. It is generally at least 10 times faster > than the existing implementation on bench-strcspn on a few AArch64 > implementations, and for some tests 100 times as fast (repeatedly calling > strchr on a small string is extremely slow...). In fact the string/bits/string2.h > inlines make no longer sense, as GCC already uses strlen if reject is an empty > string, strchrnul is 5 times as fast as __strcspn_c1, while __strcspn_c2 and > __strcspn_c3 are slower than the strcspn main loop for large strings (though reject > length 2-4 could be special cased in the future to gain even more performance). > > Built and tested on AArch64. OK for GLIBC 2.23? > > ChangeLog: > 2016-01-08 Wilco Dijkstra <wdijkstr@arm.com> > > * string/strcspn.c (strcspn): Rewrite function. > * string/bits/string2.h (strcspn): Use __builtin_strcspn. > (__strcspn_c1) Remove inline function. > (__strcspn_c2) Likewise. > (__strcspn_c3) Likewise. > > diff --git a/string/bits/string2.h b/string/bits/string2.h > index e18c67530ec78338c9591015f3d95c785de54726..d0926f1e7898a13852081f45d54bff5145751695 100644 > --- a/string/bits/string2.h > +++ b/string/bits/string2.h > @@ -199,62 +199,10 @@ extern void *__rawmemchr (const void *__s, int __c); > > /* Return the length of the initial segment of S which > consists entirely of characters not in REJECT. */ > -#if !defined _HAVE_STRING_ARCH_strcspn > -# ifndef _HAVE_STRING_ARCH_strcspn > -# if __GNUC_PREREQ (3, 2) > -# define strcspn(s, reject) \ > - __extension__ \ > - ({ char __r0, __r1, __r2; \ > - (__builtin_constant_p (reject) && __string2_1bptr_p (reject) \ > - ? ((__builtin_constant_p (s) && __string2_1bptr_p (s)) \ > - ? __builtin_strcspn (s, reject) \ > - : ((__r0 = ((const char *) (reject))[0], __r0 == '\0') \ > - ? strlen (s) \ > - : ((__r1 = ((const char *) (reject))[1], __r1 == '\0') \ > - ? __strcspn_c1 (s, __r0) \ > - : ((__r2 = ((const char *) (reject))[2], __r2 == '\0') \ > - ? __strcspn_c2 (s, __r0, __r1) \ > - : (((const char *) (reject))[3] == '\0' \ > - ? __strcspn_c3 (s, __r0, __r1, __r2) \ > - : __builtin_strcspn (s, reject)))))) \ > - : __builtin_strcspn (s, reject)); }) > -# endif > +#ifndef _HAVE_STRING_ARCH_strcspn > +# if __GNUC_PREREQ (3, 2) > +# define strcspn(s, reject) __builtin_strcspn (s, reject) > # endif > - > -__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject); > -__STRING_INLINE size_t > -__strcspn_c1 (const char *__s, int __reject) > -{ > - size_t __result = 0; > - while (__s[__result] != '\0' && __s[__result] != __reject) > - ++__result; > - return __result; > -} > - > -__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1, > - int __reject2); > -__STRING_INLINE size_t > -__strcspn_c2 (const char *__s, int __reject1, int __reject2) > -{ > - size_t __result = 0; > - while (__s[__result] != '\0' && __s[__result] != __reject1 > - && __s[__result] != __reject2) > - ++__result; > - return __result; > -} > - > -__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1, > - int __reject2, int __reject3); > -__STRING_INLINE size_t > -__strcspn_c3 (const char *__s, int __reject1, int __reject2, > - int __reject3) > -{ > - size_t __result = 0; > - while (__s[__result] != '\0' && __s[__result] != __reject1 > - && __s[__result] != __reject2 && __s[__result] != __reject3) > - ++__result; > - return __result; > -} > #endif > > > diff --git a/string/strcspn.c b/string/strcspn.c > index 2694d2ab0e807d0712d6b103dbdfd75ef164ebf1..cdb2df315c394889fe04b69980c63ea4ddbdb286 100644 > --- a/string/strcspn.c > +++ b/string/strcspn.c > @@ -26,16 +26,48 @@ > /* Return the length of the maximum initial segment of S > which contains no characters from REJECT. */ > size_t > -STRCSPN (const char *s, const char *reject) > +STRCSPN (const char *str, const char *reject) > { > - size_t count = 0; > + unsigned char table[256]; > + unsigned char *p, *s; > + unsigned int c0, c1, c2, c3; > + size_t count; > > - while (*s != '\0') > - if (strchr (reject, *s++) == NULL) > - ++count; > - else > - return count; > + if (reject[0] == '\0') > + return strlen (str); > + if (reject[1] == '\0') > + return __strchrnul (str, reject [0]) - str; I am not sure how often strcspn is used with empty or one char argument to validate this optimization in specific since it adds more branch cases for more general inputs. > > - return count; > + /* Use multiple small memsets to enable inlining on most targets. */ > + p = memset (table, 0, 64); > + memset (p + 64, 0, 64); > + memset (p + 128, 0, 64); > + memset (p + 192, 0, 64); It is unfortunate we need to use this to force inline instead to let the compiler handle it directly (and also simplifying the code by using c99 initializers). I noted x86_64 does no inline, although aarch64 and powerpc64le calls memset. How bad is avoiding this explicit calls now and work on compiler side to detect this aligned memset? > + > + s = (unsigned char*) reject; > + do > + p[c0 = *s++] = 1; > + while (c0); > + > + s = (unsigned char*) str; > + if (p[s[0]]) return 0; > + if (p[s[1]]) return 1; > + if (p[s[2]]) return 2; > + if (p[s[3]]) return 3; > + > + s = (unsigned char *) ((size_t)s & ~3); > + > + do > + { > + s += 4; > + c0 = p[s[0]]; > + c1 = p[s[1]]; > + c2 = p[s[2]]; > + c3 = p[s[3]]; > + } > + while ((c0 | c1 | c2 | c3) == 0); > + > + count = s - (unsigned char *) str; > + return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3; > } > libc_hidden_builtin_def (strcspn) >
Adhemerval Zanella Netto - Jan. 8, 2016, 8:05 p.m. wrote: > > + if (reject[0] == '\0') > > + return strlen (str); > > + if (reject[1] == '\0') > > + return __strchrnul (str, reject [0]) - str; > > I am not sure how often strcspn is used with empty or one char argument to > validate this optimization in specific since it adds more branch cases for > more general inputs. An empty string is extremely unlikely, however one and two characters seem to occur frequently (grep the GLIC sources for str(c)spn/strpbrk). My goal was to get rid of the odd inlines in the headers and enable the generic C implementation to beat the special cases by a good margin. Compared to the overhead of the initialization of the table, these extra checks cost very little (and once you check for a single-character string, you also need to check for an empty string). > > - return count; > > + /* Use multiple small memsets to enable inlining on most targets. */ > > + p = memset (table, 0, 64); > > + memset (p + 64, 0, 64); > > + memset (p + 128, 0, 64); > > + memset (p + 192, 0, 64); > > It is unfortunate we need to use this to force inline instead to let the > compiler handle it directly (and also simplifying the code by using > c99 initializers). I noted x86_64 does no inline, although aarch64 and > powerpc64le calls memset. How bad is avoiding this explicit calls now > and work on compiler side to detect this aligned memset? Yes but unfortunately inlining of memset is essential to get reasonable performance on small sizes. Eg. for sizes 30-60 the overhead of not inlining is 25-30% on Cortex-A57. We could maybe add a --param max-inline-memset=N option to a future GCC for building GLIBC (or just these files), however this doesn't help when GLIBC is built using any current GCC versions. Another possibility might be to write a loop with stores of size_t and build with a huge value for max-completely-peeled-insns. Or just give up and use macros to write out all stores explicitly... Wilco
diff --git a/string/bits/string2.h b/string/bits/string2.h index e18c67530ec78338c9591015f3d95c785de54726..d0926f1e7898a13852081f45d54bff5145751695 100644 --- a/string/bits/string2.h +++ b/string/bits/string2.h @@ -199,62 +199,10 @@ extern void *__rawmemchr (const void *__s, int __c); /* Return the length of the initial segment of S which consists entirely of characters not in REJECT. */ -#if !defined _HAVE_STRING_ARCH_strcspn -# ifndef _HAVE_STRING_ARCH_strcspn -# if __GNUC_PREREQ (3, 2) -# define strcspn(s, reject) \ - __extension__ \ - ({ char __r0, __r1, __r2; \ - (__builtin_constant_p (reject) && __string2_1bptr_p (reject) \ - ? ((__builtin_constant_p (s) && __string2_1bptr_p (s)) \ - ? __builtin_strcspn (s, reject) \ - : ((__r0 = ((const char *) (reject))[0], __r0 == '\0') \ - ? strlen (s) \ - : ((__r1 = ((const char *) (reject))[1], __r1 == '\0') \ - ? __strcspn_c1 (s, __r0) \ - : ((__r2 = ((const char *) (reject))[2], __r2 == '\0') \ - ? __strcspn_c2 (s, __r0, __r1) \ - : (((const char *) (reject))[3] == '\0' \ - ? __strcspn_c3 (s, __r0, __r1, __r2) \ - : __builtin_strcspn (s, reject)))))) \ - : __builtin_strcspn (s, reject)); }) -# endif +#ifndef _HAVE_STRING_ARCH_strcspn +# if __GNUC_PREREQ (3, 2) +# define strcspn(s, reject) __builtin_strcspn (s, reject) # endif - -__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject); -__STRING_INLINE size_t -__strcspn_c1 (const char *__s, int __reject) -{ - size_t __result = 0; - while (__s[__result] != '\0' && __s[__result] != __reject) - ++__result; - return __result; -} - -__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1, - int __reject2); -__STRING_INLINE size_t -__strcspn_c2 (const char *__s, int __reject1, int __reject2) -{ - size_t __result = 0; - while (__s[__result] != '\0' && __s[__result] != __reject1 - && __s[__result] != __reject2) - ++__result; - return __result; -} - -__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1, - int __reject2, int __reject3); -__STRING_INLINE size_t -__strcspn_c3 (const char *__s, int __reject1, int __reject2, - int __reject3) -{ - size_t __result = 0; - while (__s[__result] != '\0' && __s[__result] != __reject1 - && __s[__result] != __reject2 && __s[__result] != __reject3) - ++__result; - return __result; -} #endif diff --git a/string/strcspn.c b/string/strcspn.c index 2694d2ab0e807d0712d6b103dbdfd75ef164ebf1..cdb2df315c394889fe04b69980c63ea4ddbdb286 100644 --- a/string/strcspn.c +++ b/string/strcspn.c @@ -26,16 +26,48 @@ /* Return the length of the maximum initial segment of S which contains no characters from REJECT. */ size_t -STRCSPN (const char *s, const char *reject) +STRCSPN (const char *str, const char *reject) { - size_t count = 0; + unsigned char table[256]; + unsigned char *p, *s; + unsigned int c0, c1, c2, c3; + size_t count; - while (*s != '\0') - if (strchr (reject, *s++) == NULL) - ++count; - else - return count; + if (reject[0] == '\0') + return strlen (str); + if (reject[1] == '\0') + return __strchrnul (str, reject [0]) - str; - return count; + /* Use multiple small memsets to enable inlining on most targets. */ + p = memset (table, 0, 64); + memset (p + 64, 0, 64); + memset (p + 128, 0, 64); + memset (p + 192, 0, 64); + + s = (unsigned char*) reject; + do + p[c0 = *s++] = 1; + while (c0); + + s = (unsigned char*) str; + if (p[s[0]]) return 0; + if (p[s[1]]) return 1; + if (p[s[2]]) return 2; + if (p[s[3]]) return 3; + + s = (unsigned char *) ((size_t)s & ~3); + + do + { + s += 4; + c0 = p[s[0]]; + c1 = p[s[1]]; + c2 = p[s[2]]; + c3 = p[s[3]]; + } + while ((c0 | c1 | c2 | c3) == 0); + + count = s - (unsigned char *) str; + return (c0 | c1) != 0 ? count - c0 + 1 : count - c2 + 3; } libc_hidden_builtin_def (strcspn)