new file mode 100644
@@ -0,0 +1,211 @@
+/* Vectorized arithmetic used in string functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+
+/* This file is intended for writing optimized string functions using
+ vectorization.
+
+ The main idea is simple.  Most string functions can be described as
+ searching for the first byte that satisfies some expression, followed by
+ a simple output conversion.  For example, in strlen we look for the first
+ byte x where x == 0 and then subtract the start pointer to get the length.
+
+ Once you have such an expression, you use this skeleton to evaluate it in
+ parallel on eight bytes at once, which gives a considerable speedup.
+
+ The expression has to be built from primitives that allow vectorization.
+
+ Bitwise arithmetic (&, |, ^, ~) is always allowed.  For most tricks you
+ also need addition and subtraction, where you must be more careful.
+ If you can ensure that your expression does not overflow, you can use it
+ directly.  However, expressions that may overflow are considerably faster,
+ and you can use them if you are a bit careful.  When you only want to find
+ the first byte where the expression is true, you usually do not care that
+ a match may corrupt the following bytes; what you cannot do is overflow
+ when there is no match.  You therefore need to supply two versions of the
+ expression: an EXPRESSION macro that may overflow on a match and an
+ EXPRESSION_NOCARRY macro that may not.
+
+ The vector arithmetic tricks work as follows.  The idea is to take an
+ expression that operates on an unsigned byte s and evaluates to 0 for a
+ nonzero byte and to a nonzero value for a zero byte.  Our expression is
+ ((s - 1) & ~s) & 128.  We evaluate this expression on every byte of a
+ word in parallel, so the word-sized result is nonzero exactly when some
+ byte matched, and the first matching byte holds a nonzero value.
+
+ We also need to provide a version of the expression that does not cause
+ carry propagation, so that the per-byte computations really are
+ independent.  On little endian architectures this is not needed, because
+ we stop at the first byte that matches and do not care that the following
+ bytes may be corrupted.
+
+ You can also use the premade predicates.  contains_zero (x) yields 128 in
+ each byte position where the corresponding byte of x is zero and 0
+ elsewhere, and bytes_equal (x, y) yields 128 in each byte position where
+ the bytes of x and y are equal and 0 elsewhere.  Both come with _nocarry
+ variants that cannot overflow.
+
+ For performance, an architecture with hardware support should redefine
+ these primitives.  Most often one wants to supply one's own
+ first_nonzero_byte and define CUSTOM_FIRST_NONZERO_BYTE to suppress the
+ default one.  The others are contains_zero and bytes_equal, which need to
+ be redefined along with their nocarry counterparts.
+
+ Having a fast hardware bytewise minimum is a game changer, as it allows a
+ considerable speedup.  However, it would need separate skeletons, since
+ its main benefit is faster aggregation.
+
+ After that comes tuning, as there are several variables that affect
+ performance, such as the unroll factors.  This could be automated by
+ running with different options against a dry-run profile trace and
+ selecting the best one.
+ */
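+
+/* A worked example (illustration only, not part of the interface), assuming
+ a 64-bit little endian vector_int: for the string "ab\0cdefg" the first
+ word loaded is 0x6766656463006261 and
+
+     v = LOADU (s);
+     m = contains_zero (v);        yields 0x0000000000800000
+     first_nonzero_byte (m);       yields 2
+
+ which is exactly strlen ("ab").  A strlen built on this skeleton keeps
+ loading further words until contains_zero returns a nonzero mask.  */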
+
+#ifndef VECTOR_INT
+# define VECTOR_INT unsigned long int
+#endif
+
+#if _STRING_ARCH_unaligned
+# define UNALIGNED_HEADER_UNROLL 4
+#else
+# define UNALIGNED_HEADER_UNROLL 0
+#endif
+#define ALIGNED_HEADER_UNROLL 4
+#define LOOP_UNROLL 4
+
+typedef VECTOR_INT vector_int;
+
+static const vector_int ones = (~0UL / 255); /* 0x0101...*/
+static const vector_int add = 127 * (~0UL / 255);
+static const vector_int high_bits = 128 * (~0UL / 255);
+
+
+#define LSIZE sizeof (vector_int)
+#ifdef PAGE_SIZE
+# define CROSS_PAGE(x, n) (((uintptr_t) x) % PAGE_SIZE > PAGE_SIZE - n)
+#else
+# define CROSS_PAGE(x, n) (((uintptr_t) x) % 4096 > 4096 - n)
+#endif
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define SHIFT_BYTES(x, n) ((x) << (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) >> (8 * (n)))
+#else
+# define SHIFT_BYTES(x, n) ((x) >> (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) << (8 * (n)))
+#endif
+
+/* Sets the first n bytes to zero. */
+#define FORGET_BYTES(x, n) SHIFT_BYTES_UP (SHIFT_BYTES (x, n), n)
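+
+/* For example (illustration only), on little endian with LSIZE == 8,
+ FORGET_BYTES (0x8080808080808080, 3) is 0x8080808080000000: the three
+ bytes lowest in memory are cleared while the rest keep their position.  */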
+
+
+/* Load a vector.  This needs to be a macro because of the cast.
+ The argument of LOAD (x) must be aligned; the argument of LOADU (x)
+ does not have to be.  Unaligned loads are emulated on platforms that
+ do not support them.  */
+
+#define LOAD(x) (*((vector_int *) (x)))
+#if _STRING_ARCH_unaligned == 0
+/* Here we could combine the two shifts if the architecture defines a shift
+ by the full word width (such as x << 64) as zero. */
+
+# define LOADU(x) ({ \
+ char *aligned = PTR_ALIGN_DOWN ((char *) (x), LSIZE); \
+ unsigned int align = ((char *) (x)) - aligned; \
+ (SHIFT_BYTES (LOAD (aligned), align) \
+ | SHIFT_BYTES_UP (SHIFT_BYTES_UP (LOAD (aligned + LSIZE), \
+ LSIZE - 1 - align), 1)); \
+ })
+
+#else
+# define LOADU(x) LOAD (x)
+#endif
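+
+/* For example (illustration only), with LSIZE == 8 and x == aligned + 3 the
+ emulated LOADU (x) takes the last five bytes of LOAD (aligned) and the
+ first three bytes of LOAD (aligned + LSIZE), giving the same eight bytes
+ a hardware unaligned load from x would return.  */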
+
+
+#ifndef CUSTOM_CONTAINS_ZERO
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+/* A possible question is how fast a byte swap is on big endian
+ architectures.  If it can be done within a cycle, it might be profitable
+ to emulate little endian there by overriding LOAD and LOADU. */
+
+static __always_inline
+vector_int
+contains_zero (vector_int s)
+{
+ return (s - ones) & ~s & high_bits;
+}
+# else
+# define contains_zero contains_zero_nocarry
+# endif
+
+static __always_inline
+vector_int
+contains_zero_nocarry (vector_int s)
+{
+ return (((s | high_bits) - ones) ^ high_bits) & ~s & high_bits;
+}
+#endif
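+
+/* A small worked example of the carry issue (illustration only), using
+ two-byte values for brevity on little endian: for s == 0x0100 (a zero
+ byte followed by 0x01) the fast form (s - ones) & ~s & high_bits gives
+ 0x8080, because the borrow from the zero byte corrupts the following
+ byte, while the nocarry form gives 0x0080 and flags only the real zero
+ byte.  On little endian the corruption is harmless, as it only affects
+ bytes after the first match; on big endian the borrow would run into the
+ preceding byte in memory order, which is why EXPRESSION_NOCARRY is used
+ there.  */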
+
+#ifndef CUSTOM_BYTES_EQUAL
+static __always_inline
+vector_int
+bytes_equal (vector_int x, vector_int y)
+{
+ return contains_zero (x ^ y);
+}
+
+static __always_inline
+vector_int
+bytes_equal_nocarry (vector_int x, vector_int y)
+{
+ return contains_zero_nocarry (x ^ y);
+}
+#endif
+
+/*
+ When you have a hardware ctz/clz it is probably the best bet.  However,
+ for software emulation you can do better than the generic routine, as you
+ do not need to consider every bit, only the highest bit of each byte,
+ which can be computed more cheaply.
+ */
+#ifndef CUSTOM_FIRST_NONZERO_BYTE
+static __always_inline
+size_t
+first_nonzero_byte (vector_int u)
+{
+# if __BYTE_ORDER == __BIG_ENDIAN
+# ifdef NEED_BITWISE
+ u = u | (u >> 1);
+ u = u | (u >> 2);
+ u = u | (u >> 4);
+# else
+ u = u >> 7;
+# endif
+ u = u | (u >> 8);
+ u = u | (u >> 16);
+ u = u | (u >> 32);
+# ifdef NEED_BITWISE
+ u = u & ones;
+# endif
+ u = u * ones;
+ return 8 - (u >> (8 * LSIZE - 8));
+
+# else
+ /* Note that this works also in bitwise case. */
+ u = u ^ (u - 1);
+ u = u & ones;
+ u = u * ones;
+ return (u >> (8 * LSIZE - 8)) - 1;
+# endif
+}
+#endif
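+
+/* For example (illustration only), on 64-bit little endian a mask of
+ 0x0000000000800000 (match in the third byte) gives
+   u ^ (u - 1) == 0x0000000000ffffff
+   (u ^ (u - 1)) & ones == 0x0000000000010101
+ and multiplying by ones sums those 0x01 bytes into the top byte, so the
+ final shift yields 3 and first_nonzero_byte returns 2.  */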
new file mode 100644
@@ -0,0 +1,239 @@
+/* Skeleton of generic string functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+/* On big endian a carry caused by a match could produce a false positive
+ in the preceding byte, so use the no-carry expression there. */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
+#ifdef CUSTOM_CMASK
+# define CMASK_PARAM_MASK CMASK_PARAM
+#else
+# define CMASK_PARAM int c_in
+# define CMASK_PARAM_MASK vector_int cmask
+#endif
+
+#ifdef LOOKAHEAD
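+/* BYTE (n) combines the previous and current vectors so that, for an
+ EXPRESSION using lookahead, byte i of BYTE (n) is the input byte that
+ lies n positions after the candidate starting byte reported for mask
+ position i.  BYTE (LOOKAHEAD) is the current vector itself.  */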
+# define BYTE(n) \
+ (SHIFT_BYTES (SHIFT_BYTES (previous, LSIZE - (LOOKAHEAD - n) - 1), 1) \
+ | SHIFT_BYTES_UP (v, LOOKAHEAD - n))
+#endif
+
+#ifdef LOOKAHEAD
+# define SHIFT_LOOKAHEAD(x) FORGET_BYTES (x, LOOKAHEAD)
+# define ADJUST LOOKAHEAD
+#else
+# define SHIFT_LOOKAHEAD(x) x
+# define ADJUST 0
+#endif
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, CMASK_PARAM, char *end)
+{
+ vector_int mask;
+ char *s = (char *) s_in;
+ vector_int v = 0;
+ vector_int __attribute__ ((unused)) previous = 0;
+#ifndef CUSTOM_CMASK
+ unsigned char c = (unsigned char) c_in;
+ vector_int __attribute__ ((unused)) cmask = c * ones;
+#endif
+
+ /* We fetch 32 bytes as long as that does not cross a page boundary.
+ Most strings in practice fit into that size and we avoid a loop.
+ This performs best in practice; the alternative below uses aligned
+ loads but is slower when the string starts just a few bytes before a
+ 32-byte boundary.  The tradeoff is that we may occasionally fetch an
+ extra cache line without needing it, but this optimization still pays
+ for that. */
+
+ if (!CROSS_PAGE (s, 32) && UNALIGNED_HEADER_UNROLL > 0)
+ {
+#define UNALIGNED_HEADER_CHECK(i) \
+ if (UNALIGNED_HEADER_UNROLL >= i) \
+ { \
+ previous = v; \
+ v = LOADU (s + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ if (i == 1) \
+ mask = SHIFT_LOOKAHEAD (mask); \
+ if (mask) \
+ return s + (i - 1) * LSIZE + first_nonzero_byte (mask) - ADJUST; \
+ }
+
+ UNALIGNED_HEADER_CHECK (1);
+ UNALIGNED_HEADER_CHECK (2);
+ UNALIGNED_HEADER_CHECK (3);
+ UNALIGNED_HEADER_CHECK (4);
+ UNALIGNED_HEADER_CHECK (5);
+ UNALIGNED_HEADER_CHECK (6);
+ UNALIGNED_HEADER_CHECK (7);
+ UNALIGNED_HEADER_CHECK (8);
+
+ if (BOUND (s + LSIZE * UNALIGNED_HEADER_UNROLL))
+ return NULL;
+ }
+ else
+ {
+ /* We need to use aligned loads here.  For the first load we read some
+ bytes before the start and discard them by shifting them out. */
+
+ char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+ v = LOAD (s_aligned);
+ /* We need to be careful here, as the bytes before the start could
+ corrupt the mask. */
+ mask = SHIFT_BYTES ((EXPRESSION_NOCARRY (v, cmask)), s - s_aligned);
+ mask = SHIFT_LOOKAHEAD (mask);
+ if (mask)
+ return s + first_nonzero_byte (mask) - ADJUST;
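+
+ /* For example (illustration only), if s == s_aligned + 3 the three mask
+ bytes corresponding to bytes before s are shifted out, so a match at
+ the word's byte 5 shows up at mask byte 2 and s + first_nonzero_byte
+ (mask) points at it correctly. */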
+
+ /* With a lookahead of i we need to ignore matches in the first i bytes
+ as false positives.  With a lookahead of 2 or more these could cross a
+ word boundary, and we need to mask them. */
+
+#if defined (LOOKAHEAD) && LOOKAHEAD > 1
+# define FIX_LOOKAHEAD(i) \
+ if (i == 2 && s + LOOKAHEAD > s_aligned + LSIZE) \
+ mask = FORGET_BYTES (mask, s + LOOKAHEAD - (s_aligned + LSIZE));
+#else
+# define FIX_LOOKAHEAD(i)
+#endif
+
+ /* We need to check for crossing the end of the string after each
+ iteration, as each one could cross a page boundary.  Note that we do
+ not have to check after the last iteration, as that would duplicate
+ the check in the loop. */
+
+#define ALIGNED_HEADER_CHECK(i) \
+ if (ALIGNED_HEADER_UNROLL >= i) \
+ { \
+ if (BOUND (s_aligned + (i - 1) * LSIZE)) \
+ return NULL; \
+ previous = v; \
+ v = LOAD (s_aligned + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ FIX_LOOKAHEAD (i); \
+ if (mask) \
+ return s_aligned + (i - 1) * LSIZE + first_nonzero_byte (mask) \
+ - ADJUST; \
+ }
+ ALIGNED_HEADER_CHECK (2);
+ ALIGNED_HEADER_CHECK (3);
+ ALIGNED_HEADER_CHECK (4);
+ ALIGNED_HEADER_CHECK (5);
+ ALIGNED_HEADER_CHECK (6);
+ ALIGNED_HEADER_CHECK (7);
+ ALIGNED_HEADER_CHECK (8);
+ }
+
+ /* Now we have read enough bytes to start the loop, assuming the
+ following: */
+
+ assert (UNALIGNED_HEADER_UNROLL <= 8);
+ assert (ALIGNED_HEADER_UNROLL <= 8);
+
+ assert (LOOP_UNROLL <= ALIGNED_HEADER_UNROLL);
+ assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+ || UNALIGNED_HEADER_UNROLL == 0);
+
+ char *s_loop = PTR_ALIGN_DOWN (s, LOOP_UNROLL * LSIZE);
+
+#ifdef LOOKAHEAD
+ v = LOAD (s_loop + (LOOP_UNROLL - 1) * LSIZE);
+#endif
+
+ while (!BOUND (s_loop + LOOP_UNROLL * LSIZE))
+ {
+ s_loop += LOOP_UNROLL * LSIZE;
+ vector_int masks[9];
+ masks[0] = 0;
+
+#define MERGE_MASK(i) \
+ if (LOOP_UNROLL >= i) \
+ { \
+ previous = v; \
+ v = LOAD (s_loop + (i - 1) * LSIZE); \
+ masks[i] = masks[i - 1] | EXPRESSION (v, cmask); \
+ }
+
+ MERGE_MASK (1);
+ MERGE_MASK (2);
+ MERGE_MASK (3);
+ MERGE_MASK (4);
+ MERGE_MASK (5);
+ MERGE_MASK (6);
+ MERGE_MASK (7);
+ MERGE_MASK (8);
+
+ if (masks[LOOP_UNROLL])
+ {
+
+ /* Here we have two possibilities, depending on register pressure.
+ When you do not have enough registers, recalculating the results
+ gives the fastest loop for large inputs.  However, it is likely
+ affected by a gcc bug where gcc tries to save the intermediate
+ results, causing a spill in each iteration to speed up the final
+ iteration a bit.  To avoid that we need a compiler barrier here. */
+
+#ifdef RECALCULATE_ON_TAIL
+ asm volatile ("" : : : "memory");
+# define CHECK_MASK(i) \
+ previous = v; \
+ v = LOAD (s_loop + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ if (mask) \
+ return s_loop + (i - 1) * LSIZE + first_nonzero_byte (mask) \
+ - ADJUST;
+
+# else
+ /* On the other hand, when you have enough free registers you can save
+ the intermediate ORs.  When you check the masks you know that, to
+ reach mask i, all previous expressions must have been zero, so the
+ cumulative OR masks[i] equals the expression for block i alone. */
+
+# define CHECK_MASK(i) \
+ if (masks[i]) \
+ return s_loop + (i - 1) * LSIZE + first_nonzero_byte (masks[i]) - ADJUST;
+# endif
+
+#ifdef LOOKAHEAD
+ v = LOAD (s_loop - LSIZE);
+#endif
+ CHECK_MASK (1);
+ CHECK_MASK (2);
+ CHECK_MASK (3);
+ CHECK_MASK (4);
+ CHECK_MASK (5);
+ CHECK_MASK (6);
+ CHECK_MASK (7);
+ CHECK_MASK (8);
+ }
+ }
+ return NULL;
+}
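+
+/* A minimal usage sketch (hypothetical, not part of this patch): a
+ strchrnul-style function could instantiate this skeleton by defining the
+ expressions before including the two headers (the file names below are
+ placeholders):
+
+     #define EXPRESSION(v, cmask) \
+       (contains_zero (v) | bytes_equal (v, cmask))
+     #define EXPRESSION_NOCARRY(v, cmask) \
+       (contains_zero_nocarry (v) | bytes_equal_nocarry (v, cmask))
+     #include <string_vector.h>
+     #include <string_skeleton.h>
+
+     char *
+     strchrnul (const char *s, int c)
+     {
+       return string_skeleton (s, c, NULL);
+     }
+
+ Here the default BOUND never triggers, so the skeleton always returns a
+ pointer to the first byte that is either c or the terminating nul.  */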