
[2/3] Optimize strchrnul with unrolling, better header and unaligned loads

Message ID 20150526180007.GB26817@domone
State New

Commit Message

Ondřej Bílka May 26, 2015, 6 p.m. UTC
Now this is a real optimization of strchrnul.
It should be faster thanks to unrolling.

A second improvement targets practical strings. Most of these have a match
within the first 32 bytes, so we should avoid the loop there.
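
To illustrate why a fixed 32-byte read is safe here: the read cannot
fault as long as it stays within the page of s, which gives a cheap
test, assuming 4096-byte pages (this mirrors the CROSS_PAGE macro in
the patch below):

#include <stdint.h>

/* Reading N bytes at X stays within X's page exactly when the page
   offset of X is at most 4096 - N.  */
#define CROSS_PAGE(x, n) ((((uintptr_t) (x)) % 4096) >= 4096 - (n))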

If unaligned loads are available, they should be used. I found that one
bottleneck was the first check, which was too often false because it
inspected only a few bytes.
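
Where the architecture does not guarantee unaligned loads, the portable
way to express one is a memcpy that the compiler folds into a single
load; a sketch (the helper name is mine):

#include <string.h>

/* Portable unaligned load; compilers turn the memcpy into one
   word-sized load where the target allows it.  */
static inline unsigned long int
load_unaligned (const char *p)
{
  unsigned long int w;
  memcpy (&w, p, sizeof w);
  return w;
}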

Are these the only such architectures? I know that x86-64 forgot to add
the header, so there may be others.

sysdeps/m68k/m680x0/m68020/bits/string.h:#define _STRING_ARCH_unaligned 1
sysdeps/s390/bits/string.h:#define _STRING_ARCH_unaligned       1
sysdeps/x86/bits/string.h:#define _STRING_ARCH_unaligned        1


Also, checking the start byte-by-byte is ineffective; it is better just to
mask the bytes before the start. The code now relies on a fast ffs; if that
is not available, there are other ways to find the first set bit quickly
(binary search or multiplication).
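
For reference, the multiplication method is a de Bruijn bitscan; a
sketch for 64-bit words, with the standard constant and table (not part
of this patch):

#include <stdint.h>

/* Index of the least significant set bit of nonzero X.  (x & -x)
   isolates the lowest set bit; the de Bruijn multiply moves a unique
   6-bit pattern into the top bits, which indexes the table.  */
static inline int
first_set_bit (uint64_t x)
{
  static const int table[64] =
    {
       0,  1, 48,  2, 57, 49, 28,  3,
      61, 58, 50, 42, 38, 29, 17,  4,
      62, 55, 59, 36, 53, 51, 43, 22,
      45, 39, 33, 30, 24, 18, 12,  5,
      63, 47, 56, 27, 60, 41, 37, 16,
      54, 35, 52, 21, 44, 32, 23, 11,
      46, 26, 40, 15, 34, 20, 31, 10,
      25, 14, 19,  9, 13,  8,  7,  6
    };
  return table[((x & -x) * 0x03f79d71b4cb0a89ULL) >> 58];
}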

There are several details that would make this faster but less readable,
so I have omitted them for now. One would be adding a goto to the end of
the function, as gcc likes to duplicate an identical tail five times
instead of just jumping there.

Could arch maintainers test this and report whether it is an improvement?

	* string/strchrnul.c (STRCHRNUL): Optimize implementation.

Patch

diff --git a/string/strchrnul.c b/string/strchrnul.c
index 4986c5d..ea91195 100644
--- a/string/strchrnul.c
+++ b/string/strchrnul.c
@@ -21,9 +21,8 @@ 
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
-#include <memcopy.h>
-#include <stdlib.h>
-
+#include <libc-internal.h>
+#include <stdint.h>
 #undef __strchrnul
 #undef strchrnul
 
@@ -31,146 +30,144 @@ 
 # define STRCHRNUL __strchrnul
 #endif
 
-/* Find the first occurrence of C in S or the final NUL byte.  */
-#ifdef AS_STRCHRNUL
-static __always_inline
-#endif
-char *
-STRCHRNUL (s, c_in)
-     const char *s;
-     int c_in;
-{
-  const unsigned char *char_ptr;
-  const unsigned long int *longword_ptr;
-  unsigned long int longword, magic_bits, charmask;
-  unsigned char c;
-
-  c = (unsigned char) c_in;
-
-  /* Handle the first few characters by reading one character at a time.
-     Do this until CHAR_PTR is aligned on a longword boundary.  */
-  for (char_ptr = (const unsigned char *) s;
-       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
-       ++char_ptr)
-    if (*char_ptr == c || *char_ptr == '\0')
-      return (void *) char_ptr;
+static const unsigned long int ones = (~0UL / 255);  /* 0x0101...01.  */
+static const unsigned long int add = 127 * (~0UL / 255);  /* 0x7f7f...7f.  */
+static const unsigned long int high_bits = 128 * (~0UL / 255);  /* 0x8080...80.  */
 
-  /* All these elucidatory comments refer to 4-byte longwords,
-     but the theory applies equally well to 8-byte longwords.  */
+/* Use vector arithmetic tricks.  The idea is an expression that, for each
+   byte of a word in parallel, sets that byte's high bit exactly when the
+   byte is zero.  With ADD = 0x7f...7f, the high bit of ((s & ADD) + ADD)
+   is set when a byte has any of its low seven bits set; or-ing in s also
+   covers bytes with the top bit set, so the complement is nonzero exactly
+   for zero bytes.  No carry crosses a byte boundary, so there are no
+   false positives.  */
 
-  longword_ptr = (unsigned long int *) char_ptr;
+static __always_inline
+unsigned long int
+contains_zero (unsigned long int s)
+{
+  return ~(((s & add) + add) | s | add) & high_bits;
+}
 
-  /* Bits 31, 24, 16, and 8 of this number are zero.  Call these bits
-     the "holes."  Note that there is a hole just to the left of
-     each byte, with an extra at the end:
+static __always_inline
+int
+found_in_long_bytes (char *s, unsigned long int cmask, char **result)
+{
+  const unsigned long int *lptr = (const unsigned long int *) s;
+  unsigned long int mask = contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+  if (mask)
+    {
+      *result = s + ffsl (mask) / 8 - 1;
+      return 1;
+    }
+  else
+    return 0;
+}
 
-     bits:  01111110 11111110 11111110 11111111
-     bytes: AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
+#define LSIZE sizeof (unsigned long int)
+/* Here we assume that the smallest page size is 4096 bytes.  */
+#define CROSS_PAGE(x, n) (((uintptr_t) x) % 4096 >= 4096 - (n))
 
-     The 1-bits make sure that carries propagate to the next 0-bit.
-     The 0-bits provide holes for carries to fall into.  */
-  switch (sizeof (longword))
+/* Find the first occurrence of C in S or the final NUL byte.  */
+#ifdef AS_STRCHR
+static __always_inline
+#endif
+char *
+STRCHRNUL (const char *s_in, int c_in)
+{
+  unsigned long int mask;
+  const unsigned long int *lptr;
+  char *s = (char *) s_in;
+  unsigned char c = (unsigned char) c_in;
+  char *r;
+  unsigned long int cmask = c * ones;
+
+#if _STRING_ARCH_unaligned
+  /* We fetch 32 bytes while not crossing a page boundary.
+     Most strings in practice match within them and we avoid the loop.
+     This looks best in practice; the alternative below uses an aligned
+     load but is slower when the string starts just a few bytes before
+     a 32-byte boundary.  The tradeoff is that we could occasionally
+     fetch an extra cache line without needing it, but this optimization
+     pays for that.  */
+  if (!CROSS_PAGE (s, 32))
     {
-    case 4: magic_bits = 0x7efefeffL; break;
-    case 8: magic_bits = ((0x7efefefeL << 16) << 16) | 0xfefefeffL; break;
-    default:
-      abort ();
+      if (found_in_long_bytes (s + 0 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 1 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 2 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 3 * LSIZE, cmask, &r))
+        return r;
+      if (sizeof (unsigned long int) == 4)
+        {
+          if (found_in_long_bytes (s + 4 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 5 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 6 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 7 * LSIZE, cmask, &r))
+            return r;
+        }
     }
+  else
+    {
+#endif
+      /* We need to use aligned loads.  For the first load we read some
+         bytes before the start and discard them by shifting the mask.  */
+
+      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      lptr = (const unsigned long int *) s_aligned;
+      mask = (contains_zero (*lptr)
+              | contains_zero (*lptr ^ cmask))
+             >> (8 * (s - s_aligned));
+
+      if (mask)
+        return s + ffsl (mask) / 8 - 1;
+
+      if (found_in_long_bytes (s_aligned + 1 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s_aligned + 2 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s_aligned + 3 * LSIZE, cmask, &r))
+        return r;
+#if _STRING_ARCH_unaligned
+    }
+#endif
   /* Now we have checked enough bytes to start the loop.  */
 
-  /* Set up a longword, each of whose bytes is C.  */
-  charmask = c | (c << 8);
-  charmask |= charmask << 16;
-  if (sizeof (longword) > 4)
-    /* Do the shift in two steps to avoid a warning if long has 32 bits.  */
-    charmask |= (charmask << 16) << 16;
-  if (sizeof (longword) > 8)
-    abort ();
-
-  /* Instead of the traditional loop which tests each character,
-     we will test a longword at a time.  The tricky part is testing
-     if *any of the four* bytes in the longword in question are zero.  */
-  for (;;)
+  char *s_loop = PTR_ALIGN_DOWN (s, 4 * LSIZE);
+  do 
     {
-      /* We tentatively exit the loop if adding MAGIC_BITS to
-	 LONGWORD fails to change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of LONGWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.  If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24.  If one of bits 24-30 is set, there will be a carry
-	 into bit 31, so all of the hole bits will be changed.
-
-	 The one misfire occurs when bits 24-30 are clear and bit
-	 31 is set; in this case, the hole at bit 31 is not
-	 changed.  If we had access to the processor carry flag,
-	 we could close this loophole by putting the fourth hole
-	 at bit 32!
-
-	 So it ignores everything except 128's, when they're aligned
-	 properly.
-
-	 3) But wait!  Aren't we looking for C as well as zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-      longword = *longword_ptr++;
-
-      /* Add MAGIC_BITS to LONGWORD.  */
-      if ((((longword + magic_bits)
-
-	    /* Set those bits that were unchanged by the addition.  */
-	    ^ ~longword)
-
-	   /* Look at only the hole bits.  If any of the hole bits
-	      are unchanged, most likely one of the bytes was a
-	      zero.  */
-	   & ~magic_bits) != 0 ||
-
-	  /* That caught zeroes.  Now test for C.  */
-	  ((((longword ^ charmask) + magic_bits) ^ ~(longword ^ charmask))
-	   & ~magic_bits) != 0)
-	{
-	  /* Which of the bytes was C or zero?
-	     If none of them were, it was a misfire; continue the search.  */
-
-	  const unsigned char *cp = (const unsigned char *) (longword_ptr - 1);
-
-	  if (*cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (*++cp == c || *cp == '\0')
-	    return (char *) cp;
-	  if (sizeof (longword) > 4)
-	    {
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	      if (*++cp == c || *cp == '\0')
-		return (char *) cp;
-	    }
-	}
+      s_loop += 4 * LSIZE;
+      lptr = (const unsigned long int *) (s_loop + 0 * LSIZE);
+      mask = contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+      lptr = (const unsigned long int *) (s_loop + 1 * LSIZE);
+      mask |= contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+      lptr = (const unsigned long int *) (s_loop + 2 * LSIZE);
+      mask |= contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+      lptr = (const unsigned long int *) (s_loop + 3 * LSIZE);
+      mask |= contains_zero (*lptr) | contains_zero (*lptr ^ cmask);
+
     }
+  while (!mask);
+  /* The loop guarantees that one of the four words above contains a
+     match, so the last check cannot fail.  */
+  if (found_in_long_bytes (s_loop + 0 * LSIZE, cmask, &r))
+    return r;
+  if (found_in_long_bytes (s_loop + 1 * LSIZE, cmask, &r))
+    return r;
+  if (found_in_long_bytes (s_loop + 2 * LSIZE, cmask, &r))
+    return r;
+  found_in_long_bytes (s_loop + 3 * LSIZE, cmask, &r);
+  return r;
+}
+
+
+char *
+strchr (const char *s, int c)
+{
+  char *r = STRCHRNUL (s, c);
+  return *r == (char) c ? r : NULL;
 
-  /* This should never happen.  */
-  return NULL;
 }
 
+#ifndef AS_STRCHR
 weak_alias (__strchrnul, strchrnul)
+#endif
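
For testing, a quick harness along these lines (my sketch, not part of
the patch; it assumes the routine is built and exported as __strchrnul)
compares the result against a byte-by-byte reference on random strings:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Byte-by-byte reference implementation of strchrnul.  */
static char *
strchrnul_ref (const char *s, int c_in)
{
  unsigned char c = (unsigned char) c_in;
  while (*s != '\0' && *(const unsigned char *) s != c)
    s++;
  return (char *) s;
}

extern char *__strchrnul (const char *, int);

int
main (void)
{
  static char buf[256];
  for (int i = 0; i < 1000000; i++)
    {
      size_t len = rand () % 128;
      for (size_t j = 0; j < len; j++)
        buf[j] = 1 + rand () % 255;  /* Random nonzero bytes.  */
      buf[len] = '\0';
      int c = rand () % 256;
      if (__strchrnul (buf, c) != strchrnul_ref (buf, c))
        {
          printf ("mismatch at c=%d, s=\"%s\"\n", c, buf);
          return 1;
        }
    }
  puts ("ok");
  return 0;
}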