diff mbox

[v5,4*] Generic string search functions (strstr, strcasestr, memmem)

Message ID 20150601131146.GA22285@domone
State New
Headers show

Commit Message

Ondřej Bílka June 1, 2015, 1:11 p.m. UTC
And here is optimization of searching functions. I still use first pair
trick but realized that strcasestr could be more general. First I need
only tolower(n[0,1]) to be in ascii which happens almost always for
accented characters. I also realized that to prevent false positives you
dont need to check first pair of characters always, only when they lie
outside ascii.

I will try collect profile trace for benchmark, problem is that almost
nobody uses strcasestr. I found only glibc calls it in debug and bash at
one place of job control. If somebody has application that calls
frequently strcasestr I would be glad to see it.

 	* sysdeps/generic/string_vector_search.h: New file.
 	* string/strstr.c (STRSTR): Use skeleton.
 	* string/strstr.c (STRCASESTR): Likewise.
 	* string/strstr.c (__memmem): Likewise..
diff mbox

Patch

diff --git a/string/memmem.c b/string/memmem.c
index 8a81f65..2dc551a 100644
--- a/string/memmem.c
+++ b/string/memmem.c
@@ -35,6 +35,45 @@ 
 
 #undef memmem
 
+#include <string_vector.h>
+
+struct cmask
+{
+  unsigned long int n0, n1;
+  size_t needle_size;
+};
+#define CUSTOM_CMASK
+#define CMASK_PARAM struct cmask cmask
+
+#define LOOKAHEAD 1
+#define EXPRESSION_NOCARRY(x, cmask) \
+ ( bytes_equal_nocarry (BYTE (0), cmask.n0) 				\
+   &  bytes_equal_nocarry (BYTE (1), cmask.n1))
+
+/* We calculate mask, not look for first byte and carry could corrupt
+   following bytes. Only possible optimization would be use
+   contains_zero (x) as it must end there. */
+#define EXPRESSION(x, cmask) EXPRESSION_NOCARRY(x, cmask)
+
+static int
+check (char *s, char *n, unsigned long *rent, struct cmask cmask)
+{
+  /* First two characters were already checked by vector loop.  */
+  size_t i = 2;
+
+  while (i < cmask.needle_size && s[i] == n[i])
+    i++;
+
+  if (i == cmask.needle_size)
+    return 1;
+
+  rent += i + 10;
+  return 0;
+}
+
+#define CHECK_N
+#include <string_vector_search.h>
+
 /* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
    if NEEDLE_LEN is 0, otherwise NULL if NEEDLE is not found in
    HAYSTACK.  */
@@ -46,6 +85,7 @@  __memmem (const void *haystack_start, size_t haystack_len,
      not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
   const unsigned char *haystack = (const unsigned char *) haystack_start;
   const unsigned char *needle = (const unsigned char *) needle_start;
+  char *ret;
 
   if (needle_len == 0)
     /* The first occurrence of the empty string is deemed to occur at
@@ -57,6 +97,28 @@  __memmem (const void *haystack_start, size_t haystack_len,
   if (__glibc_unlikely (haystack_len < needle_len))
     return NULL;
 
+  unsigned char *n = (unsigned char *) needle;
+  if (__BYTE_ORDER == __LITTLE_ENDIAN && needle_len >= 2)
+    {
+      struct cmask cmask;
+      cmask.n0 = broadcast (n[0]);
+      cmask.n1 = broadcast (n[1]);
+      cmask.needle_size = needle_len;
+      size_t search_size = haystack_len - needle_len + 1;
+      if (vector_search ((char *) haystack, (char *) needle,
+		         cmask, search_size, &ret))
+        return (void *) ret;
+      else
+        {
+	  haystack_len -= (char *) ret - (char *) haystack;
+          haystack = (const unsigned char *) ret;
+
+          if (ret == NULL || haystack_len < needle_len)
+            return NULL;
+        }
+    }
+
+
   /* Use optimizations in memchr when possible, to reduce the search
      size of haystack using a linear algorithm with a smaller
      coefficient.  However, avoid memchr for long needles, since we
diff --git a/string/strcasestr.c b/string/strcasestr.c
index 400fab8..ae40930 100644
--- a/string/strcasestr.c
+++ b/string/strcasestr.c
@@ -57,6 +57,65 @@ 
 #define STRCASESTR __strcasestr
 #endif
 
+/* A initial fast vectorized loop looking for first digraph.  */
+
+#include <string_vector.h>
+
+struct cmask
+{
+  vector_int l0, u0, l1, u1;
+};
+#define CUSTOM_CMASK
+#define CMASK_PARAM struct cmask cmask
+
+  /* We use trick that for when character is in ascii and character
+     0 <= c <= 127 could be caselessly equal only one of characters
+     tolower (c), toupper (c) or a character x in range 128 <= x <= 255
+     As these are exactly characters have highest bit set to 1 we adjust
+     a expression from strstr and filter lower bits by anding with high_bits.
+   */
+
+#define LOOKAHEAD 1
+#define BYTE_EXPRESSION(x, l, u) \
+  ( bytes_equal_nocarry (x, l) 				\
+  | bytes_equal_nocarry (x, u)				\
+  | x)
+
+#define EXPRESSION_NOCARRY(x, cmask) (( \
+ ( BYTE_EXPRESSION (BYTE(0), cmask.l0, cmask.u0)		  \
+ & BYTE_EXPRESSION (BYTE(1), cmask.l1, cmask.u1) 		  \
+ ) | contains_zero_nocarry (BYTE (1))) & high_bits)
+
+/* We calculate mask, not look for first byte and carry could corrupt
+   following bytes. Only possible optimization would be use
+   contains_zero (x) as it must end there. */
+#define EXPRESSION(x, cmask) EXPRESSION_NOCARRY (x, cmask)
+
+static int
+check (char *s, char *n, unsigned long *rent, struct cmask cmask)
+{
+  size_t i = 0;
+
+  if (!s[1])
+    return 1;
+
+  /* First two characters match if they are in ascii.  */
+  if (((unsigned char*)s)[0] < 128 && ((unsigned char*)s)[1] < 128)
+    i = 2;
+
+  while (n[i] && tolower (s[i]) == tolower (n[i]))
+    i++;
+
+  if (!n[i])
+    return 1;
+
+  rent += i + 10;
+  return 0;
+}
+
+#include <string_vector_search.h>
+
+#include "../locale/localeinfo.h"
 
 /* Find the first occurrence of NEEDLE in HAYSTACK, using
    case-insensitive comparison.  This function gives unspecified
@@ -66,10 +125,33 @@  STRCASESTR (const char *haystack_start, const char *needle_start)
 {
   const char *haystack = haystack_start;
   const char *needle = needle_start;
+  char *ret;
   size_t needle_len; /* Length of NEEDLE.  */
   size_t haystack_len; /* Known minimum length of HAYSTACK.  */
   bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
 
+  __locale_t loc = _NL_CURRENT_LOCALE;
+  struct __locale_data *ctype = loc->__locales[LC_CTYPE];
+  int nonascii = ctype->values[_NL_ITEM_INDEX (_NL_CTYPE_NONASCII_CASE)].word;
+
+  unsigned char *n = (unsigned char *) needle;
+  if (__BYTE_ORDER == __LITTLE_ENDIAN && !nonascii && haystack[0] != '\0' &&
+      n[0] != '\0' && tolower(n[0]) < 128 && n[1] != '\0' && tolower(n[1]) < 128)
+    {
+      struct cmask cmask;
+      cmask.l0 = broadcast (tolower(n[0]));
+      cmask.u0 = broadcast (toupper(n[0]));
+      cmask.l1 = broadcast (tolower(n[1]));
+      cmask.u1 = broadcast (toupper(n[1]));
+
+      if (vector_search ((char *) haystack, (char *) needle, cmask,\
+			 0, &ret))
+        return (ret[1] != '\0') ? ret : NULL;
+      else
+        haystack = ret;
+    }
+
+
   /* Determine length of NEEDLE, and in the process, make sure
      HAYSTACK is at least as long (no point processing all of a long
      NEEDLE if HAYSTACK is too short).  */
diff --git a/string/strstr.c b/string/strstr.c
index 045e878..a881c7e 100644
--- a/string/strstr.c
+++ b/string/strstr.c
@@ -45,6 +45,48 @@ 
 #define STRSTR strstr
 #endif
 
+#include <string_vector.h>
+
+struct cmask
+{
+  vector_int n0, n1;
+};
+#define CUSTOM_CMASK
+#define CMASK_PARAM struct cmask cmask
+
+#define LOOKAHEAD 1
+#define EXPRESSION_NOCARRY(x, cmask) (\
+ ( bytes_equal_nocarry (BYTE (0), cmask.n0)				\
+   & bytes_equal_nocarry (BYTE (1), cmask.n1))				\
+ | contains_zero_nocarry (BYTE (1)))
+
+/* We calculate mask, not look for first byte and carry could corrupt
+   following bytes. Only possible optimization would be use
+   contains_zero (x) as it must end there. */
+#define EXPRESSION(x, cmask) EXPRESSION_NOCARRY(x, cmask)
+
+static int
+check (char *s, char *n, unsigned long *rent, struct cmask cmask)
+{
+  /* First two characters were already checked by vector loop.  */
+  size_t i = 2;
+  if (!s[1])
+    return 1;
+
+  while (n[i] && s[i] == n[i])
+    i++;
+
+  if (!n[i])
+    return 1;
+
+  rent += i + 10;
+  return 0;
+}
+
+#include <string_vector_search.h>
+
+
+
 /* Return the first occurrence of NEEDLE in HAYSTACK.  Return HAYSTACK
    if NEEDLE is empty, otherwise NULL if NEEDLE is not found in
    HAYSTACK.  */
@@ -53,10 +95,28 @@  STRSTR (const char *haystack_start, const char *needle_start)
 {
   const char *haystack = haystack_start;
   const char *needle = needle_start;
+  char *ret;
   size_t needle_len; /* Length of NEEDLE.  */
   size_t haystack_len; /* Known minimum length of HAYSTACK.  */
   bool ok = true; /* True if NEEDLE is prefix of HAYSTACK.  */
 
+  unsigned char *n = (unsigned char *) needle;
+  if (__BYTE_ORDER == __LITTLE_ENDIAN
+      && haystack[0] != '\0' && n[0] != '\0' && n[1] != '\0')
+    {
+      struct cmask cmask;
+      cmask.n0 =  broadcast (n[0]);
+      cmask.n1 =  broadcast (n[1]);
+
+      if (vector_search ((char *) haystack, (char *) needle, cmask,\
+			 0, &ret))
+        return ret[1] != '\0' ? ret : NULL;
+      else
+        haystack = ret;
+    }
+
+
+
   /* Determine length of NEEDLE, and in the process, make sure
      HAYSTACK is at least as long (no point processing all of a long
      NEEDLE if HAYSTACK is too short).  */
diff --git a/string/test-strcasestr.c b/string/test-strcasestr.c
index 489dc84..f8a7ee9 100644
--- a/string/test-strcasestr.c
+++ b/string/test-strcasestr.c
@@ -21,12 +21,7 @@ 
 #define TEST_NAME "strcasestr"
 #include "test-string.h"
 
-
-#define STRCASESTR simple_strcasestr
-#define NO_ALIAS
-#define __strncasecmp strncasecmp
-#include "strcasestr.c"
-
+#include <ctype.h>
 
 static char *
 stupid_strcasestr (const char *s1, const char *s2)
@@ -54,7 +49,6 @@  stupid_strcasestr (const char *s1, const char *s2)
 typedef char *(*proto_t) (const char *, const char *);
 
 IMPL (stupid_strcasestr, 0)
-IMPL (simple_strcasestr, 0)
 IMPL (strcasestr, 1)
 
 
diff --git a/sysdeps/generic/string_vector_search.h b/sysdeps/generic/string_vector_search.h
new file mode 100644
index 0000000..263fc25
--- /dev/null
+++ b/sysdeps/generic/string_vector_search.h
@@ -0,0 +1,89 @@ 
+/* Algorithm to select a faster string search implementation.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/*
+   For string search algorithms there are two ways how you could look
+   at performance.
+
+   From practical standpoint a naive algorithm is baseline.  Most theoretical
+   algorithms (BW, KMP, ...) are slower because they need to do expensive 
+   random access.  This makes prohibitive constant cost per character while naive
+   algorithm is linear on random inputs as most of time you don't match string.
+   Also potential gains are nonexistent, user supplies needle so you could be
+   happy when its at least 8 characters long and most haystacks are less than 64
+   bytes large.
+
+   On other hand we need to ensure a linear worst-case of our algorithm.
+
+   We do that with bit of accounting. We count number of comparisons and when
+   it exceeds a size of haystack times some constant we switch to two-way
+   algorithm with guaranteed linear time.
+
+   Main performance boost is gained by vectorizing naive algorithm checks.
+   We will check in parallel if first digraph matches. That should be quite rare,
+  in english most frequent digraph is th with frequency around 1%.
+
+   Then after each digraph found we face decision if to keep looking for
+   digraphs or switch to two-way algorithm. These are covered as common
+   problem in online algorithm setting: buy-or-rent problem.
+   A precomputation in two-way algorithm with needle size n takes
+   around same time as 20 character comparisons so in worst case a two-way
+   algorithm would be twenty times slower than naive for haystacks of size n+1.
+   A solution would be pay higher rent rate until it accumulates to buy cost.
+   Then we would in worst case be twice slower than selecting optimal
+   implementation from start.
+
+   That would work except it needs strlen (needle) which is unnecessary
+   in practice. To show that haystack cannot match needle it suffices to know
+   that there is no leading triplet from needle in haystack. As unless there
+   was a planted needle in haystack and false positive is unlikely we likely
+   don't have to inspect more than three or four characters from needle. Also
+   correct accounting takes time so we approximate cost based on number of
+   comparisons and vector searchs.
+
+ */
+
+#include <string_vector_skeleton.h>
+
+static int
+vector_search (char *haystack, char *needle, CMASK_PARAM,
+	       size_t check_n, char **ret)
+{
+  char *s = haystack;
+  char *end = s + check_n;
+  unsigned long rent = 0;
+  while (rent < 256 + (s - haystack))
+    {
+      s = string_skeleton (s, cmask, end - s);
+      if (s == NULL)
+	{
+	  *ret = s;
+	  return 1;
+	}
+      if (check (s, needle, &rent, cmask))
+	{
+	  *ret = s;
+	  return 1;
+	}
+      else
+	s++;
+    }
+  /* Superlinear behaviour detected, switch to two-way algorithm.  */
+  *ret = s;
+  return 0;
+}