
[v5] Generic string skeleton

Message ID 20150531190013.GA8530@domone
State New

Commit Message

Ondřej Bílka May 31, 2015, 7 p.m. UTC
Here is the next iteration of the skeleton.
It still has the primitives inline, not in separate directories selected
by benchmark.

I also simplified the handling of the strn* functions compared to the
previous version.

For the selection of tp_vector we could do the same benchmarking: try,
say, the strlen benchmark with int, long and long long, and select the
one that has the highest throughput per byte.
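
For illustration, a rawmemchr-style consumer would look roughly like
this (an untested sketch with a made-up name; it just reuses bytes_equal
as the expression and assumes both headers are on the include path):

#define EXPRESSION(v, cmask) bytes_equal (v, cmask)
#define EXPRESSION_NOCARRY(v, cmask) bytes_equal_nocarry (v, cmask)

#include <string_vector.h>
#include <string_vector_skeleton.h>

/* Return pointer to the first byte equal to C; C is known to occur.  */
void *
rawmemchr_example (const void *s, int c)
{
  /* The size argument is ignored because CHECK_N is not defined.  */
  return string_skeleton (s, c, 0);
}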

	* sysdeps/generic/string_vector.h: New file.
	* sysdeps/generic/string_vector_skeleton.h: Likewise.

Patch

diff --git a/sysdeps/generic/string_vector.h b/sysdeps/generic/string_vector.h
new file mode 100644
index 0000000..c55c13d
--- /dev/null
+++ b/sysdeps/generic/string_vector.h
@@ -0,0 +1,229 @@ 
+/* Vectorized arithmetic used in string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+
+/* This is intended for writing optimized string functions using
+   vectorization.
+
+   The main idea is simple. Most string functions can be described as
+   searching for the first byte that satisfies a certain expression,
+   followed by a simple output conversion. For example, in strlen we look
+   for the first byte x where x == 0, and then subtract the start pointer
+   to get the length.
+
+   Once you have the expression, you use the skeleton to evaluate it in
+   parallel on eight bytes at once, which gives a considerable speedup.
+
+   You need to build the expression from primitives that allow
+   vectorization.
+
+   Bitwise arithmetic (&,|,^,~) is allowed. For most tricks you also need
+   addition and subtraction, where you must be more careful.
+   If you can ensure that your expression doesn't overflow then you can
+   use it as well. However, expressions that could overflow are considerably
+   faster and you can use them if you are a bit careful. When you only want
+   to find the first byte where the expression is true, you usually don't
+   care that on success it could corrupt the following bytes. However, you
+   can't overflow on failure. You need to supply two versions of the
+   expression: an EXPRESSION macro that can overflow on success and an
+   EXPRESSION_NOCARRY one that can't.
+
+   These expressions use vector arithmetic tricks. The idea is to take an
+   expression that works on an unsigned byte and evaluates to 0 for a
+   nonzero byte and to a nonzero value for a zero byte.
+   Our expression is ((s - 1) & (~s)) & 128
+   Now we evaluate this expression on each byte in parallel and at the
+   first zero byte the expression has a nonzero value.
+
+   We need to provide a version of the expression that doesn't cause carry
+   propagation, so that the per-byte operations can be done in parallel.
+   However, this is not needed on little endian architectures, as we stop
+   at the first byte that matches and don't care that the following bytes
+   could be corrupted.
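+
+   For example, take the byte s = 'a' = 97: s - 1 = 96, ~s = 158 and
+   96 & 158 & 128 = 0.  For s = 0: s - 1 = 255, ~s = 255 and
+   255 & 255 & 128 = 128, so only a zero byte yields a nonzero result.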
+
+   Then you can use the premade predicates. There is contains_zero (x),
+   which yields 128 in each byte of x that is zero and 0 in the others,
+   and bytes_equal (x, y), which yields 128 in each byte where x and y are
+   equal and zero elsewhere; these come with variants that don't cause
+   overflow.
+
+   For performance, an architecture with hardware support should redefine
+   the primitives. Most often one wants to supply its own first_nonzero_byte
+   and define CUSTOM_FIRST_NONZERO_BYTE to avoid defining the default one.
+
+   The others are contains_zero and bytes_equal, which need to be redefined
+   along with their nocarry counterparts.
+
+   Having a fast hardware bytewise minimum is a game changer, as it allows
+   a considerable speedup. However, it would need separate skeletons, as
+   the main benefit is in faster aggregation.
+
+   After that comes tuning, as there are several variables that affect
+   performance, like the number of times unrolled. This could be automated
+   by running with different options against a dryrun profile trace and
+   selecting the best one.
+ */
+
+#ifndef VECTOR_INT
+# define VECTOR_INT unsigned long int
+#endif
+
+#if _STRING_ARCH_unaligned
+# define UNALIGNED_HEADER_UNROLL 4
+#else
+# define UNALIGNED_HEADER_UNROLL 0
+#endif
+#define ALIGNED_HEADER_UNROLL 4
+#define LOOP_UNROLL 4
+
+typedef VECTOR_INT vector_int;
+
+static const vector_int ones = ((~(vector_int) 0) / 255); /* 0x0101...*/
+static const vector_int add = 127 * ((~(vector_int) 0) / 255);
+static const vector_int high_bits = 128 * ((~(vector_int) 0) / 255);
+
+
+#define LSIZE sizeof (vector_int)
+#ifdef MINIMAL_PAGE_SIZE
+# define CROSS_PAGE(x, n) (((uintptr_t) (x)) % MINIMAL_PAGE_SIZE > MINIMAL_PAGE_SIZE - (n))
+#else
+# define CROSS_PAGE(x, n) (((uintptr_t) (x)) % 4096 > 4096 - (n))
+#endif
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define SHIFT_BYTES(x, n) ((x) << (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) >> (8 * (n)))
+#else
+# define SHIFT_BYTES(x, n) ((x) >> (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) << (8 * (n)))
+#endif
+
+/* Set the first n bytes (in memory order) of v to zero.  */
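+/* For example forget_bytes (m, 2) clears the two bytes of m at the lowest
+   memory addresses; the skeleton uses this to discard matches that are
+   known to be false positives.  */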
+static __always_inline
+vector_int
+forget_bytes (vector_int v, int n)
+{
+  if (n < 0)
+    return v;
+  /* This check could be deleted when shift does zeroing.  */
+  if (n > LSIZE)
+    return 0;
+  return SHIFT_BYTES_UP (SHIFT_BYTES (v, n), n);
+}
+
+
+/* Load a vector. This needs to be a macro because of the cast.
+   The argument of LOAD (x) must be aligned while for LOADU it does not
+   have to be. Unaligned loads are emulated on platforms that don't
+   support them.  */
+
+#define LOAD(x) (*((vector_int *) (x)))
+#if _STRING_ARCH_unaligned == 0
+/* Here we could combine the shifts if the architecture defines x << 64 as zero.  */
+
+# define LOADU(x) ({ \
+		     char *aligned = PTR_ALIGN_DOWN ((char *) (x), LSIZE);                  \
+		     unsigned int align = ((char *) (x)) - aligned;                         \
+		     (SHIFT_BYTES (LOAD (aligned), align)                                   \
+		      | SHIFT_BYTES_UP (SHIFT_BYTES_UP (LOAD (aligned + LSIZE),             \
+							LSIZE - 1 - align), 1));            \
+		   })
+
+#else
+# define LOADU(x) LOAD (x)
+#endif
+
+#ifndef CUSTOM_BROADCAST
+static __always_inline
+vector_int
+broadcast (unsigned char c)
+{
+  return ones * c;
+}
+#endif
+
+#ifndef CUSTOM_CONTAINS_ZERO
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+/* A possible question is how fast byteswap is on big endian
+   architectures. If it can be done within a cycle it might be
+   profitable to emulate little endian there by overriding LOAD and LOADU.  */
+
+static __always_inline
+vector_int
+contains_zero (vector_int s)
+{
+  return (s - ones) & ~s & high_bits;
+}
+# else
+#  define contains_zero contains_zero_nocarry
+# endif
+
+static __always_inline
+vector_int
+contains_zero_nocarry (vector_int s)
+{
+  return (((s | high_bits) - ones) ^ high_bits) & ~s & high_bits;
+}
+#endif
+
+#ifndef CUSTOM_BYTES_EQUAL
+static __always_inline
+vector_int
+bytes_equal (vector_int x, vector_int y)
+{
+  return contains_zero (x ^ y);
+}
+
+static __always_inline
+vector_int
+bytes_equal_nocarry (vector_int x, vector_int y)
+{
+  return contains_zero_nocarry (x ^ y);
+}
+#endif
+
+/*
+  When you have hardware ctz/clz it is probably the best bet. However,
+  for software emulation you can do better than the generic routine as
+  you don't need to consider each bit, just the highest bit in each byte,
+  which can be calculated more effectively.
+ */
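+/* For example, with 8-byte little endian vectors an input of
+   0x0000000000800000 (only the byte at memory offset 2 is nonzero) makes
+   first_nonzero_byte return 2.  */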
+#ifndef CUSTOM_FIRST_NONZERO_BYTE
+static __always_inline
+size_t
+first_nonzero_byte (vector_int u)
+{
+# if __BYTE_ORDER == __BIG_ENDIAN
+#  ifdef NEED_BITWISE
+  u = u | (u >> 1);
+  u = u | (u >> 2);
+  u = u | (u >> 4);
+#  else
+  u = u >> 7;
+#  endif
+  u = u | (u >> 8);
+  u = u | (u >> 16);
+  u = u | (u >> 32);
+#  ifdef NEED_BITWISE
+  u = u & ones;
+#  endif
+  u = u * ones;
+  return 8 - (u >> (8 * LSIZE - 8));
+
+# else
+  /* Note that this works also in bitwise case.  */
+  u = u ^ (u - 1);
+  u = u & ones;
+  u = u * ones;
+  return (u >> (8 * LSIZE - 8)) - 1;
+# endif
+}
+#endif
diff --git a/sysdeps/generic/string_vector_skeleton.h b/sysdeps/generic/string_vector_skeleton.h
new file mode 100644
index 0000000..346f0ed
--- /dev/null
+++ b/sysdeps/generic/string_vector_skeleton.h
@@ -0,0 +1,261 @@ 
+/* Skeleton of generic string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+
+/* On big endian a carry from the expression could cause a false positive
+   in the preceding byte, so always use the expression without carry.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
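+/* With CUSTOM_CMASK the caller supplies the mask parameter (CMASK_PARAM)
+   directly; otherwise the function takes an int character and broadcasts
+   it into cmask below.  */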
+#ifdef CUSTOM_CMASK
+# define CMASK_PARAM_MASK CMASK_PARAM
+#else
+# define CMASK_PARAM int c_in
+# define CMASK_PARAM_MASK vector_int cmask
+#endif
+
+#ifdef LOOKAHEAD
+# define BYTE(n) \
+  (SHIFT_BYTES (SHIFT_BYTES (previous, LSIZE - (LOOKAHEAD - n) - 1), 1)  \
+   | SHIFT_BYTES_UP (v, LOOKAHEAD - n))
+#endif
+
+#ifdef LOOKAHEAD
+# define _LOOKAHEAD LOOKAHEAD
+#else
+# define _LOOKAHEAD 0
+#endif
+
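+/* CHECK_N is defined by the strn* variants; _CHECK_N (x) then expands to
+   the length check x, otherwise it is the constant 0 so that all length
+   checks compile away.  */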
+#ifdef CHECK_N
+# define _CHECK_N(x) x
+#else
+# define _CHECK_N(x) 0
+#endif
+
+#define LOOP_SIZE (LOOP_UNROLL * LSIZE)
+
+#define RETURN(x, mask) \
+  do 								  \
+    { 								  \
+      char *ret = x + first_nonzero_byte (mask); 		  \
+      return (_CHECK_N (n <= ret - s)) ? NULL : ret - _LOOKAHEAD; \
+    } 								  \
+  while (0)
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, CMASK_PARAM, size_t n)
+{
+  vector_int mask;
+  char *s = (char *) s_in;
+  vector_int v = 0;
+  vector_int __attribute__ ((unused)) previous = 0;
+#ifndef CUSTOM_CMASK
+  unsigned char c = (unsigned char) c_in;
+  vector_int __attribute__ ((unused)) cmask = broadcast (c);
+#endif
+
+  if (_CHECK_N (n == 0))
+    return NULL;
+
+  /* When it does not cross a page boundary we fetch the first 32 bytes
+     with unaligned loads. Most strings are shorter than that in practice,
+     so we avoid a loop. This performs best in practice; the alternative
+     below uses aligned loads but is slower when the string starts just a
+     few bytes before a 32-byte boundary. The tradeoff is that we could
+     rarely fetch an extra cache line without needing it, but this
+     optimization does pay for that.  */
+
+  if (!CROSS_PAGE (s, LSIZE * UNALIGNED_HEADER_UNROLL)
+      && UNALIGNED_HEADER_UNROLL > 0)
+    {
+#define UNALIGNED_HEADER_CHECK(i) \
+  if (UNALIGNED_HEADER_UNROLL >= i)                                      \
+    {                                                                    \
+      previous = v;                                                      \
+      v = LOADU (s + (i - 1) * LSIZE);                                   \
+      mask = EXPRESSION (v, cmask);                                      \
+      if (i == 1)                                                        \
+	mask = forget_bytes (mask, _LOOKAHEAD);                          \
+      if (mask)                                                          \
+	RETURN (s + (i - 1) * LSIZE, mask);				 \
+    }
+
+      UNALIGNED_HEADER_CHECK (1);
+      UNALIGNED_HEADER_CHECK (2);
+      UNALIGNED_HEADER_CHECK (3);
+      UNALIGNED_HEADER_CHECK (4);
+      UNALIGNED_HEADER_CHECK (5);
+      UNALIGNED_HEADER_CHECK (6);
+      UNALIGNED_HEADER_CHECK (7);
+      UNALIGNED_HEADER_CHECK (8);
+
+      if (_CHECK_N (n <= LSIZE * UNALIGNED_HEADER_UNROLL))
+	return NULL;
+    }
+  else
+    {
+      /* We need to use aligned loads. For the first load we read some
+         bytes before the start that we discard by shifting them away.  */
+
+      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      v = LOAD (s_aligned);
+      /* We need to be careful here as the bytes read before the start
+         could cause false positives.  */
+      mask = SHIFT_BYTES ((EXPRESSION_NOCARRY (v, cmask)), s - s_aligned);
+      mask = forget_bytes (mask, _LOOKAHEAD);
+      if (mask)
+        RETURN (s, mask);
+
+      /* When the lookahead is i we need to ignore matches in the first i
+         bytes as false positives. For a lookahead of 2 or more these could
+         cross the word boundary and we need to mask them.
+
+         We need to check for crossing the end of the string after each
+         iteration as each one could cross a page boundary. Note that we
+         don't have to check after the last iteration as it would duplicate
+         the check in the loop.  */
+
+#define ALIGNED_HEADER_CHECK(i) \
+  if (ALIGNED_HEADER_UNROLL >= i)                                         \
+    {                                                                     \
+      if (_CHECK_N (n <= s_aligned + (i - 1) * LSIZE - s))                \
+	return NULL;                                                      \
+      previous = v;                                                       \
+      v = LOAD (s_aligned + (i - 1) * LSIZE);                             \
+      mask = EXPRESSION (v, cmask);                                       \
+      if (i == 2 && _LOOKAHEAD >= 2)					  \
+        mask = forget_bytes (mask, s + _LOOKAHEAD - (s_aligned + LSIZE)); \
+      if (mask)                                                           \
+	RETURN (s_aligned + (i - 1) * LSIZE, mask);   			  \
+    }
+      ALIGNED_HEADER_CHECK (2);
+      ALIGNED_HEADER_CHECK (3);
+      ALIGNED_HEADER_CHECK (4);
+      ALIGNED_HEADER_CHECK (5);
+      ALIGNED_HEADER_CHECK (6);
+      ALIGNED_HEADER_CHECK (7);
+      ALIGNED_HEADER_CHECK (8);
+    }
+
+  /* Now we have read enough bytes to start the loop, assuming the following:  */
+
+  assert (UNALIGNED_HEADER_UNROLL <= 8);
+  assert (ALIGNED_HEADER_UNROLL <= 8);
+
+  assert (LOOP_UNROLL <= ALIGNED_HEADER_UNROLL);
+  assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+	  || UNALIGNED_HEADER_UNROLL == 0);
+
+  char *s_loop = PTR_ALIGN_DOWN (s, LOOP_SIZE);
+
+#ifdef LOOKAHEAD
+  v = LOAD (s_loop + (LOOP_UNROLL - 1) * LSIZE);
+#endif
+
+#ifdef CHECK_N
+# ifdef SAVE_N_ITERS_REGISTER
+  while (s + n - s_loop >= LOOP_SIZE)
+# else
+  size_t n_iters = (n - (s_loop + LOOP_SIZE - s)
+                      + LOOP_SIZE - 1) / (LOOP_SIZE);
+  while (n_iters--)
+# endif
+#else
+  while (1)
+#endif
+    {
+      s_loop += LOOP_SIZE;
+      vector_int masks[9];
+      masks[0] = 0;
+
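+      /* Evaluate the expression on each of the LOOP_UNROLL vectors and or
+         the partial results together so that a single test of
+         masks[LOOP_UNROLL] below decides whether this iteration contains
+         a match.  */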
+#define MERGE_MASK(i) \
+  if (LOOP_UNROLL >= i)                                                 \
+    {                                                                   \
+      previous = v;                                                     \
+      v = LOAD (s_loop + (i - 1) * LSIZE);                              \
+      masks[i] = masks[i - 1] | EXPRESSION (v, cmask);                  \
+    }
+
+      MERGE_MASK (1);
+      MERGE_MASK (2);
+      MERGE_MASK (3);
+      MERGE_MASK (4);
+      MERGE_MASK (5);
+      MERGE_MASK (6);
+      MERGE_MASK (7);
+      MERGE_MASK (8);
+
+      if (masks[LOOP_UNROLL])
+	{
+          /*
+            Here, for an optimal tail, we need to help gcc. We have already
+            computed the combined masks and did a partial or. This allows
+            us to check for the first nonzero mask, as when the previous
+            masks were zero the or of the previous masks with the current
+            one is equal to the current one.
+
+            That is nice, but you would need to save that data at each
+            iteration. The masks that can be kept in registers are fine.
+
+            The problem is the ones that don't fit, where we must spill
+            registers. To make matters worse, gcc also thinks that spilling
+            a partial result that is useless after each iteration is worth
+            it to save a few cycles of recomputation in the last iteration.
+            We prevent that with a compiler barrier.
+
+            As without compiling to assembly we don't know how many registers
+            are available, you need to try all possible values of
+            TAIL_RECOMPUTE_FIRST to recompute only the first i checks.
+            */
+
+#ifndef TAIL_RECOMPUTE_FIRST
+# define TAIL_RECOMPUTE_FIRST 0
+#endif
+
+# define CHECK_MASK(i) \
+  if (i <= TAIL_RECOMPUTE_FIRST)					\
+    {									\
+      asm volatile ("" : : : "memory");					\
+      previous = LOAD (s_loop + (i - 2) * LSIZE);                       \
+      v = LOAD (s_loop + (i - 1) * LSIZE);                              \
+      mask = EXPRESSION (v, cmask);                                     \
+      if (mask)                                                         \
+        RETURN (s_loop + (i - 1) * LSIZE, mask);                        \
+    }									\
+  else									\
+    if (masks[i])                                                       \
+      RETURN (s_loop + (i - 1) * LSIZE, masks[i]);
+
+	  CHECK_MASK (1);
+	  CHECK_MASK (2);
+	  CHECK_MASK (3);
+	  CHECK_MASK (4);
+	  CHECK_MASK (5);
+	  CHECK_MASK (6);
+	  CHECK_MASK (7);
+	  CHECK_MASK (8);
+	}
+    }
+  return NULL;
+}