diff mbox

[1/*,v2] Generic string function optimization: Add skeleton

Message ID 20150527090717.GA27814@domone
State New
Headers show

Commit Message

Ondřej Bílka May 27, 2015, 9:07 a.m. UTC
After testing I found that I added typo in conversion to skeleton. This
fixes it. Here is correct version.

	* string/common.h: New file.
	* string/skeleton.h: Likewise.
diff mbox

Patch

diff --git a/string/common.h b/string/common.h
new file mode 100644
index 0000000..3b239dd
--- /dev/null
+++ b/string/common.h
@@ -0,0 +1,35 @@ 
+#include <stdint.h>
+
+static const unsigned long int ones = (~0UL / 255); /* 0x0101...*/
+static const unsigned long int add = 127 * (~0UL / 255);
+static const unsigned long int high_bits = 128 * (~0UL / 255);
+
+/* Use vector arithmetic tricks. Idea is to take expression works on
+   unsigned byte and evaluates 0 for nozero byte and nonzero on zero byte.
+   Our expression is ((s + 127) & (~s)) & 128  
+   Now we evaluate this expression on each byte in parallel and on first 
+   nonzero byte our expression will have nonzero value. */
+
+static __always_inline
+unsigned long int 
+contains_zero (unsigned long int s)
+{
+  return (((s & add) + add) ^ high_bits) & high_bits & ~s;
+}
+
+#define LSIZE sizeof (unsigned long int)
+#define CROSS_PAGE(x, n) (((uintptr_t)x) % 4096 > 4096 - n)
+
+static __always_inline
+size_t
+first_nonzero_byte (unsigned long int u)
+{
+#ifdef FAST_FFS
+  return ffsl (u) / 8 - 1;
+#else
+  u = u ^ (u - 1);
+  u = u & ones;
+  u = u * ones;
+  return (u >> (8 * LSIZE - 8)) - 1;
+#endif
+}
diff --git a/string/skeleton.h b/string/skeleton.h
new file mode 100644
index 0000000..563e6e4
--- /dev/null
+++ b/string/skeleton.h
@@ -0,0 +1,145 @@ 
+/* Skeleton of generic string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+
+static __always_inline
+int
+found_in_long_bytes(char *s, unsigned long int cmask, char **result)
+{
+  const unsigned long int *lptr = (const unsigned long int *) s;
+  unsigned long int mask = EXPRESSION(*lptr, cmask);
+  if (mask)
+    {
+      *result = s + first_nonzero_byte (mask);
+      return 1;
+    }
+  else
+    return 0;
+}
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, int c_in, char *end)
+{
+  unsigned long int mask;
+  const unsigned long int *lptr;
+  char *s = (char *) s_in;
+  unsigned char c = (unsigned char) c_in;
+  char *r;
+  unsigned long int cmask = c * ones;
+
+#if _STRING_ARCH_unaligned
+  /* We fetch 32 bytes while not crossing page boundary. 
+     Most strings in practice are of that size and we avoid a loop.
+     This looks as best in practice, alternative below uses aligned load 
+     but is slower when string starts just few 
+     bytes before 32 byte boundary. A tradeoff is that we rarely could 
+     fetch extra cache line without needing it but this optimization 
+     does pay for that. */
+  if (!CROSS_PAGE(s, 32))
+    {
+      if (found_in_long_bytes (s + 0 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 1 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 2 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 3 * LSIZE, cmask, &r))
+        return r;
+      if (sizeof (unsigned long int) == 4)
+        {
+          if (found_in_long_bytes (s + 5 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 6 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 7 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 8 * LSIZE, cmask, &r))
+            return r;
+        }
+
+      if (BOUND (s + 32))
+        return NULL;
+    }
+  else
+    {
+#endif
+  /* We need use aligned loads. For first load we read some bytes before 
+     start that we discard by shifting them down. */
+ 
+      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      lptr = (const unsigned long int *) s_aligned;
+      mask = (EXPRESSION (*lptr, cmask)) >> (8 * (s - s_aligned));
+
+      if (mask)
+        return s + first_nonzero_byte (mask);
+
+      if (BOUND (s_aligned + 1 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 1 * LSIZE, cmask, &r))
+        return r;
+      if (BOUND (s_aligned + 2 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 2 * LSIZE, cmask, &r))
+        return r;
+      if (BOUND (s_aligned + 3 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 3 * LSIZE, cmask, &r))
+        return r;
+      if (BOUND (s_aligned + 4 * LSIZE))
+        return NULL;
+#if _STRING_ARCH_unaligned
+    }
+#endif
+   /* Now we read enough bytes to start a loop.  */
+
+  char *s_loop = PTR_ALIGN_DOWN (s, 4 * LSIZE);
+  while (!BOUND (s_loop + 4 * LSIZE))
+    {
+      s_loop += 4 * LSIZE;
+      lptr = (const unsigned long int *) (s_loop + 0 * LSIZE);
+      mask = EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 1 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 2 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 3 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+
+      if (mask)
+        {
+          if (found_in_long_bytes (s_loop + 0 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s_loop + 1 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s_loop + 2 * LSIZE, cmask, &r))
+            return r;
+
+          return s_loop + 3 * LSIZE + first_nonzero_byte (mask);
+        }
+    }
+ return NULL;
+}