Message ID | DB5PR08MB10306C82FE5593353F04E18E834D0@DB5PR08MB1030.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [v2] Improve strstr performance | expand |
On 30/06/2018 11:30, Wilco Dijkstra wrote: > Version 2 uses __strnlen to avoid namespace issues. > > Improve strstr performance. Strstr tends to be slow because it uses > many calls to memchr and a slow byte loop to scan for the next match. > Performance is significantly improved by using strnlen on larger blocks > and using strchr to search for the next matching character. strcasestr > can also use strnlen to scan ahead, and memmem can use memchr to check > for the next match. > > On the GLIBC bench tests the performance gains on Cortex-A72 are: > strstr: +25% > strcasestr: +4.3% > memmem: +18% > > On a 256KB dataset strstr performance improves by 67%, strcasestr by 47%. > > ChangeLog: > 2018-06-30 Wilco Dijkstra <wdijkstr@arm.com> > > * benchtests/bench-strcasestr.c: Rename __strnlen to strnlen. > * benchtests/bench-strstr.c: Likewise. > * string/memmem.c (FASTSEARCH): Define. > * string/str-two-way.h (two_way_short_needle): Minor cleanups. > Add support for FASTSEARCH. > * string/strcasestr.c (AVAILABLE): Use read-ahead __strnlen. > * string/strstr.c (AVAILABLE): Use read-ahead __strnlen. > (FASTSEARCH): Define. > * string/test-strcasestr.c: Rename __strnlen to strnlen. > * string/test-strstr.c: Likewise. LGTM thanks. > -- > diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c > index e6659ea79ec9b41427b8376f21b67dc326f60ec0..4337e0c18dbd5ed4a3743af51046da2dc6b6fe3a 100644 > --- a/benchtests/bench-strcasestr.c > +++ b/benchtests/bench-strcasestr.c > @@ -24,6 +24,7 @@ > #define STRCASESTR simple_strcasestr > #define NO_ALIAS > #define __strncasecmp strncasecmp > +#define __strnlen strnlen > #include "../string/strcasestr.c" > > > diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c > index c30cd1078586fe66c9177707ba603bb6ba0476a7..a31294e3c96d80a4fd61bb5b423a825fe54d3227 100644 > --- a/benchtests/bench-strstr.c > +++ b/benchtests/bench-strstr.c > @@ -23,6 +23,7 @@ > > #define STRSTR simple_strstr > #define libc_hidden_builtin_def(X) > +#define __strnlen strnlen > #include "../string/strstr.c" > > > diff --git a/string/memmem.c b/string/memmem.c > index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644 > --- a/string/memmem.c > +++ b/string/memmem.c > @@ -31,6 +31,7 @@ > > #define RETURN_TYPE void * > #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) > +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) > #include "str-two-way.h" > > #undef memmem > diff --git a/string/str-two-way.h b/string/str-two-way.h > index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..523d946c59412e1f1f65b8ec3778553b83191952 100644 > --- a/string/str-two-way.h > +++ b/string/str-two-way.h > @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, > } > else > { > - const unsigned char *phaystack = &haystack[suffix]; > + const unsigned char *phaystack; > /* The comparison always starts from needle[suffix], so cache it > and use an optimized first-character loop. */ > unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); > > -#if CHECK_EOL > - /* We start matching from the SUFFIX'th element, so make sure we > - don't hit '\0' before that. */ > - if (haystack_len < suffix + 1 > - && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) > - return NULL; > -#endif > - > /* The two halves of needle are distinct; no extra memory is > required, and any mismatch results in a maximal shift. */ > period = MAX (suffix, needle_len - suffix) + 1; > j = 0; > - while (1 > -#if !CHECK_EOL > - && AVAILABLE (haystack, haystack_len, j, needle_len) > -#endif > - ) > + while (AVAILABLE (haystack, haystack_len, j, needle_len)) > { > unsigned char haystack_char; > const unsigned char *pneedle; > > - /* TODO: The first-character loop can be sped up by adapting > - longword-at-a-time implementation of memchr/strchr. */ > - if (needle_suffix > + phaystack = &haystack[suffix + j]; > + > +#ifdef FASTSEARCH > + if (*phaystack++ != needle_suffix) > + { > + phaystack = FASTSEARCH (phaystack, needle_suffix, > + haystack_len - needle_len - j); > + if (phaystack == NULL) > + goto ret0; > + j = phaystack - &haystack[suffix]; > + phaystack++; > + } > +#else > + while (needle_suffix > != (haystack_char = CANON_ELEMENT (*phaystack++))) > { > RET0_IF_0 (haystack_char); > -#if !CHECK_EOL > +# if !CHECK_EOL > ++j; > -#endif > - continue; > + if (!AVAILABLE (haystack, haystack_len, j, needle_len)) > + goto ret0; > +# endif > } > > -#if CHECK_EOL > +# if CHECK_EOL > /* Calculate J if it wasn't kept up-to-date in the first-character > loop. */ > j = phaystack - &haystack[suffix] - 1; > +# endif > #endif > - > /* Scan for matches in right half. */ > i = suffix + 1; > pneedle = &needle[i]; > @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, > } > ++i; > } > +#if CHECK_EOL > + /* Update minimal length of haystack. */ > + if (phaystack > haystack + haystack_len) > + haystack_len = phaystack - haystack; > +#endif > if (needle_len <= i) > { > /* Scan for matches in left half. */ > @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, > } > else > j += i - suffix + 1; > - > -#if CHECK_EOL > - if (!AVAILABLE (haystack, haystack_len, j, needle_len)) > - break; > -#endif > - > - phaystack = &haystack[suffix + j]; > } > } > ret0: __attribute__ ((unused)) > diff --git a/string/strcasestr.c b/string/strcasestr.c > index 90ba189790481a58eb7627eb8c396530f988f985..5909fe3cdba88e476c7a989a020f3611bbfeb1de 100644 > --- a/string/strcasestr.c > +++ b/string/strcasestr.c > @@ -37,8 +37,8 @@ > /* Two-Way algorithm. */ > #define RETURN_TYPE char * > #define AVAILABLE(h, h_l, j, n_l) \ > - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ > - && ((h_l) = (j) + (n_l))) > + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ > + (j) + (n_l) <= (h_l))) > #define CHECK_EOL (1) > #define RET0_IF_0(a) if (!a) goto ret0 > #define CANON_ELEMENT(c) TOLOWER (c) > diff --git a/string/strstr.c b/string/strstr.c > index b3b5deb673758510616d32849bcf2fc15ce941cd..265e9f310ce507ce63740cc42d8ceea1d28dab01 100644 > --- a/string/strstr.c > +++ b/string/strstr.c > @@ -33,10 +33,11 @@ > > #define RETURN_TYPE char * > #define AVAILABLE(h, h_l, j, n_l) \ > - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ > - && ((h_l) = (j) + (n_l))) > + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ > + (j) + (n_l) <= (h_l))) > #define CHECK_EOL (1) > #define RET0_IF_0(a) if (!a) goto ret0 > +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) > #include "str-two-way.h" > > #undef strstr > diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c > index 2b0a38ea734974ec389ad206874081a8b67d8066..9b1088df54c64b82348c555e16d35cf967e19dd9 100644 > --- a/string/test-strcasestr.c > +++ b/string/test-strcasestr.c > @@ -25,6 +25,7 @@ > #define STRCASESTR simple_strcasestr > #define NO_ALIAS > #define __strncasecmp strncasecmp > +#define __strnlen strnlen > #include "strcasestr.c" > > > diff --git a/string/test-strstr.c b/string/test-strstr.c > index acf6ff8224608737701046a421faed2be9f52f68..8d99716ff39cc2c22ebebdacb5f30438f9b32ffc 100644 > --- a/string/test-strstr.c > +++ b/string/test-strstr.c > @@ -24,6 +24,7 @@ > > #define STRSTR simple_strstr > #define libc_hidden_builtin_def(arg) /* nothing */ > +#define __strnlen strnlen > #include "strstr.c" > > >
diff --git a/benchtests/bench-strcasestr.c b/benchtests/bench-strcasestr.c index e6659ea79ec9b41427b8376f21b67dc326f60ec0..4337e0c18dbd5ed4a3743af51046da2dc6b6fe3a 100644 --- a/benchtests/bench-strcasestr.c +++ b/benchtests/bench-strcasestr.c @@ -24,6 +24,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "../string/strcasestr.c" diff --git a/benchtests/bench-strstr.c b/benchtests/bench-strstr.c index c30cd1078586fe66c9177707ba603bb6ba0476a7..a31294e3c96d80a4fd61bb5b423a825fe54d3227 100644 --- a/benchtests/bench-strstr.c +++ b/benchtests/bench-strstr.c @@ -23,6 +23,7 @@ #define STRSTR simple_strstr #define libc_hidden_builtin_def(X) +#define __strnlen strnlen #include "../string/strstr.c" diff --git a/string/memmem.c b/string/memmem.c index c17e1cf6a63c0709df0c1c7a45410c5178b2876e..43efaa3fb718e7a22c3b4122c278720768644d0f 100644 --- a/string/memmem.c +++ b/string/memmem.c @@ -31,6 +31,7 @@ #define RETURN_TYPE void * #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l)) +#define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N)) #include "str-two-way.h" #undef memmem diff --git a/string/str-two-way.h b/string/str-two-way.h index cd2605857d9b5c432c3c2b1c5e35f55d8e9285b6..523d946c59412e1f1f65b8ec3778553b83191952 100644 --- a/string/str-two-way.h +++ b/string/str-two-way.h @@ -281,50 +281,50 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else { - const unsigned char *phaystack = &haystack[suffix]; + const unsigned char *phaystack; /* The comparison always starts from needle[suffix], so cache it and use an optimized first-character loop. */ unsigned char needle_suffix = CANON_ELEMENT (needle[suffix]); -#if CHECK_EOL - /* We start matching from the SUFFIX'th element, so make sure we - don't hit '\0' before that. */ - if (haystack_len < suffix + 1 - && !AVAILABLE (haystack, haystack_len, 0, suffix + 1)) - return NULL; -#endif - /* The two halves of needle are distinct; no extra memory is required, and any mismatch results in a maximal shift. */ period = MAX (suffix, needle_len - suffix) + 1; j = 0; - while (1 -#if !CHECK_EOL - && AVAILABLE (haystack, haystack_len, j, needle_len) -#endif - ) + while (AVAILABLE (haystack, haystack_len, j, needle_len)) { unsigned char haystack_char; const unsigned char *pneedle; - /* TODO: The first-character loop can be sped up by adapting - longword-at-a-time implementation of memchr/strchr. */ - if (needle_suffix + phaystack = &haystack[suffix + j]; + +#ifdef FASTSEARCH + if (*phaystack++ != needle_suffix) + { + phaystack = FASTSEARCH (phaystack, needle_suffix, + haystack_len - needle_len - j); + if (phaystack == NULL) + goto ret0; + j = phaystack - &haystack[suffix]; + phaystack++; + } +#else + while (needle_suffix != (haystack_char = CANON_ELEMENT (*phaystack++))) { RET0_IF_0 (haystack_char); -#if !CHECK_EOL +# if !CHECK_EOL ++j; -#endif - continue; + if (!AVAILABLE (haystack, haystack_len, j, needle_len)) + goto ret0; +# endif } -#if CHECK_EOL +# if CHECK_EOL /* Calculate J if it wasn't kept up-to-date in the first-character loop. */ j = phaystack - &haystack[suffix] - 1; +# endif #endif - /* Scan for matches in right half. */ i = suffix + 1; pneedle = &needle[i]; @@ -338,6 +338,11 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } ++i; } +#if CHECK_EOL + /* Update minimal length of haystack. */ + if (phaystack > haystack + haystack_len) + haystack_len = phaystack - haystack; +#endif if (needle_len <= i) { /* Scan for matches in left half. */ @@ -360,13 +365,6 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len, } else j += i - suffix + 1; - -#if CHECK_EOL - if (!AVAILABLE (haystack, haystack_len, j, needle_len)) - break; -#endif - - phaystack = &haystack[suffix + j]; } } ret0: __attribute__ ((unused)) diff --git a/string/strcasestr.c b/string/strcasestr.c index 90ba189790481a58eb7627eb8c396530f988f985..5909fe3cdba88e476c7a989a020f3611bbfeb1de 100644 --- a/string/strcasestr.c +++ b/string/strcasestr.c @@ -37,8 +37,8 @@ /* Two-Way algorithm. */ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 #define CANON_ELEMENT(c) TOLOWER (c) diff --git a/string/strstr.c b/string/strstr.c index b3b5deb673758510616d32849bcf2fc15ce941cd..265e9f310ce507ce63740cc42d8ceea1d28dab01 100644 --- a/string/strstr.c +++ b/string/strstr.c @@ -33,10 +33,11 @@ #define RETURN_TYPE char * #define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) + (((j) + (n_l) <= (h_l)) || ((h_l) += __strnlen ((void*)((h) + (h_l)), 512), \ + (j) + (n_l) <= (h_l))) #define CHECK_EOL (1) #define RET0_IF_0(a) if (!a) goto ret0 +#define FASTSEARCH(S,C,N) (void*) strchr ((void*)(S), (C)) #include "str-two-way.h" #undef strstr diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c index 2b0a38ea734974ec389ad206874081a8b67d8066..9b1088df54c64b82348c555e16d35cf967e19dd9 100644 --- a/string/test-strcasestr.c +++ b/string/test-strcasestr.c @@ -25,6 +25,7 @@ #define STRCASESTR simple_strcasestr #define NO_ALIAS #define __strncasecmp strncasecmp +#define __strnlen strnlen #include "strcasestr.c" diff --git a/string/test-strstr.c b/string/test-strstr.c index acf6ff8224608737701046a421faed2be9f52f68..8d99716ff39cc2c22ebebdacb5f30438f9b32ffc 100644 --- a/string/test-strstr.c +++ b/string/test-strstr.c @@ -24,6 +24,7 @@ #define STRSTR simple_strstr #define libc_hidden_builtin_def(arg) /* nothing */ +#define __strnlen strnlen #include "strstr.c"