new file mode 100644
@@ -0,0 +1,230 @@
+/* Vectorized arithmetic used in string functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+#include <libc-internal.h>
+
+/* This is intended for writing optimized string functions using
+   vectorization.
+
+   The main idea is simple.  Most string functions can be described as
+   searching for the first byte that satisfies a certain expression,
+   followed by a simple output conversion.  For example in strlen we look
+   for the first byte x where x == 0, followed by subtracting the start
+   pointer to get the length.
+
+   Once you have such an expression, the skeleton executes it in parallel
+   on eight bytes at once, which gives a considerable speedup.
+
+   The expression must be built from primitives that allow vectorization.
+
+   Bitwise arithmetic (&,|,^,~) is always allowed.  For most tricks you
+   also need addition and subtraction, where you must be more careful.
+   If you can ensure that your expression never overflows then you can use
+   it as well.  However, expressions that could overflow are considerably
+   faster and you can use them if you are a bit careful.  When you only
+   want to find the first byte where the expression is true, you usually
+   don't care that on success the overflow could corrupt the following
+   bytes.  You must not overflow on failure, though.  You need to supply
+   two versions of the expression: an EXPRESSION macro that may overflow
+   on success and an EXPRESSION_NOCARRY macro that may not.
+
+   The expressions use vector arithmetic tricks.  The idea is to take an
+   expression that works on an unsigned byte and evaluates to 0 for a
+   nonzero byte and to a nonzero value for a zero byte.
+   Our expression is ((s - 1) & (~s)) & 128.
+   We evaluate this expression on each byte in parallel, and in the first
+   byte that matches (here, the first zero byte) the expression yields a
+   nonzero value.
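+
+   As a worked arithmetic example: for a zero byte s == 0x00 we get
+   ((0x00 - 1) & ~0x00) & 128 == (0xff & 0xff) & 0x80 == 0x80, while for
+   a nonzero byte such as s == 0x41 we get
+   ((0x41 - 1) & ~0x41) & 128 == (0x40 & 0xbe) & 0x80 == 0.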
+
+   We also need to provide a version of the expression that does not cause
+   carry propagation, so that the per-byte operations really are done in
+   parallel.  However, it is not needed on little endian architectures, as
+   we stop at the first byte that succeeds and don't care that the
+   following ones could be corrupted.
+
+   You can also use the premade predicates.  contains_zero (x) returns 128
+   in each byte of x that is zero and 0 in the others, and
+   bytes_equal (x, y) returns 128 in each byte where x and y agree and
+   zero elsewhere.  Both come in _nocarry variants that cannot overflow.
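+
+   As an illustrative sketch (a strchrnul-style search is only an example
+   here, not part of this file), a search for byte c or for the
+   terminating zero could supply:
+
+     #define EXPRESSION(v, cmask) \
+       (contains_zero (v) | bytes_equal (v, cmask))
+     #define EXPRESSION_NOCARRY(v, cmask) \
+       (contains_zero_nocarry (v) | bytes_equal_nocarry (v, cmask))
+
+   where cmask is the searched byte broadcast to all byte positions.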
+
+   For performance, an architecture with hardware support should redefine
+   these primitives.  Most often one wants to supply its own
+   first_nonzero_byte and define CUSTOM_FIRST_NONZERO_BYTE to avoid
+   defining the default one.
+
+   The others are contains_zero and bytes_equal, which need to be
+   redefined along with their nocarry counterparts.
+
+   A fast bytewise minimum in hardware is a game changer, as it allows a
+   considerable speedup.  However, it would need separate skeletons, as
+   its main benefit lies in faster aggregation.
+
+   After that comes tuning, as several variables affect performance, such
+   as the number of times the loops are unrolled.  This could be automated
+   by running with different options against a dry-run profile trace and
+   selecting the best one.
+ */
+
+#ifndef VECTOR_INT
+# define VECTOR_INT unsigned long int
+#endif
+
+#if _STRING_ARCH_unaligned
+# define UNALIGNED_HEADER_UNROLL 4
+#else
+# define UNALIGNED_HEADER_UNROLL 0
+#endif
+#define ALIGNED_HEADER_UNROLL 4
+#define LOOP_UNROLL 4
+
+typedef VECTOR_INT vector_int;
+
+static const vector_int ones = ((~(vector_int) 0) / 255); /* 0x0101...*/
+static const vector_int add = 127 * ((~(vector_int) 0) / 255); /* 0x7f7f...*/
+static const vector_int high_bits = 128 * ((~(vector_int) 0) / 255); /* 0x8080...*/
+
+
+#define LSIZE sizeof (vector_int)
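+
+/* CROSS_PAGE (x, n) is nonzero when reading n bytes starting at x could
+   cross into the next page.  MINIMAL_PAGE_SIZE, or 4096 when it is not
+   defined, is a lower bound on the page size.  */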
+#ifdef MINIMAL_PAGE_SIZE
+# define CROSS_PAGE(x, n) (((uintptr_t) x) % MINIMAL_PAGE_SIZE > MINIMAL_PAGE_SIZE - n)
+#else
+# define CROSS_PAGE(x, n) (((uintptr_t) x) % 4096 > 4096 - n)
+#endif
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# define SHIFT_BYTES(x, n) ((x) << (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) >> (8 * (n)))
+#else
+# define SHIFT_BYTES(x, n) ((x) >> (8 * (n)))
+# define SHIFT_BYTES_UP(x, n) ((x) << (8 * (n)))
+#endif
+
+/* Set the first N bytes (in memory order) of V to zero.  */
+static __always_inline
+vector_int
+forget_bytes (vector_int v, int n)
+{
+ if (n <= 0)
+ return v;
+  /* This check could be removed on architectures where shifting by the
+     full width yields zero.  */
+ if (n >= LSIZE)
+ return 0;
+ return SHIFT_BYTES_UP (SHIFT_BYTES (v, n), n);
+}
+
+
+/* Load a vector.  These need to be macros because of the cast.
+   LOAD (x) requires x to be aligned, LOADU (x) does not.
+   Unaligned loads are emulated on platforms that don't support them.  */
+
+#define LOAD(x) (*((vector_int *) (x)))
+#if _STRING_ARCH_unaligned == 0
+/* The two shifts could be combined on architectures where a shift by the
+   full word width yields zero.  */
+
+# define LOADU(x) ({ \
+   char *aligned = PTR_ALIGN_DOWN ((char *) (x), LSIZE); \
+   unsigned int align = ((char *) (x)) - aligned; \
+   (SHIFT_BYTES (LOAD (aligned), align) \
+    | SHIFT_BYTES_UP (SHIFT_BYTES_UP (LOAD (aligned + LSIZE), \
+                                      LSIZE - 1 - align), 1)); \
+  })
+
+#else
+# define LOADU(x) LOAD (x)
+#endif
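+
+/* For example, with LSIZE == 8 and x three bytes past an aligned address,
+   the emulated LOADU above combines the last five bytes (in memory order)
+   of the word at ALIGNED with the first three bytes of the following
+   word.  The second word is shifted by LSIZE - 1 - ALIGN bytes and then
+   by one more byte so that an undefined full-width shift is avoided when
+   ALIGN == 0.  */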
+
+#ifndef CUSTOM_BROADCAST
+static __always_inline
+vector_int
+broadcast (unsigned char c)
+{
+ return ones * c;
+}
+#endif
+
+#ifndef CUSTOM_CONTAINS_ZERO
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+/* An open question is how fast a byte swap is on big endian
+   architectures.  If it can be done within a cycle it might be
+   profitable to emulate little endian there by overriding LOAD and
+   LOADU.  */
+
+static __always_inline
+vector_int
+contains_zero (vector_int s)
+{
+ return (s - ones) & ~s & high_bits;
+}
+# else
+# define contains_zero contains_zero_nocarry
+# endif
+
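+/* Setting the high bit of every byte before the subtraction guarantees
+   that no borrow can propagate into the following byte; the xor with
+   high_bits then flips the forced bit back, so after masking with ~s and
+   high_bits the result still has 128 exactly in the zero bytes.  */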
+static __always_inline
+vector_int
+contains_zero_nocarry (vector_int s)
+{
+ return (((s | high_bits) - ones) ^ high_bits) & ~s & high_bits;
+}
+#endif
+
+#ifndef CUSTOM_BYTES_EQUAL
+static __always_inline
+vector_int
+bytes_equal (vector_int x, vector_int y)
+{
+ return contains_zero (x ^ y);
+}
+
+static __always_inline
+vector_int
+bytes_equal_nocarry (vector_int x, vector_int y)
+{
+ return contains_zero_nocarry (x ^ y);
+}
+#endif
+
+/*
+   When hardware ctz/clz is available it is probably the best bet.
+   For software emulation, however, we can do better than the generic
+   routine, as we don't need to consider every bit, only the highest bit
+   of each byte, which can be aggregated more cheaply.
+ */
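+
+/* As a worked example of the little endian branch below, take LSIZE == 8
+   and a single match in byte 2, i.e. u == 0x0000000000800000:
+   u ^ (u - 1) == 0x0000000000ffffff, masking with ones leaves
+   0x0000000000010101, and multiplying by ones sums those bytes into the
+   top byte, giving 3; subtracting 1 yields the byte index 2.  */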
+#ifndef CUSTOM_FIRST_NONZERO_BYTE
+static __always_inline
+size_t
+first_nonzero_byte (vector_int u)
+{
+# if __BYTE_ORDER == __BIG_ENDIAN
+# ifdef NEED_BITWISE
+ u = u | (u >> 1);
+ u = u | (u >> 2);
+ u = u | (u >> 4);
+# else
+ u = u >> 7;
+# endif
+ u = u | (u >> 8);
+ u = u | (u >> 16);
+ u = u | (u >> 32);
+# ifdef NEED_BITWISE
+ u = u & ones;
+# endif
+ u = u * ones;
+  return LSIZE - (u >> (8 * LSIZE - 8));
+
+# else
+  /* Note that this also works in the bitwise case.  */
+ u = u ^ (u - 1);
+ u = u & ones;
+ u = u * ones;
+ return (u >> (8 * LSIZE - 8)) - 1;
+# endif
+}
+#endif
new file mode 100644
@@ -0,0 +1,261 @@
+/* Skeleton of generic string functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+
+/* On big endian an overflow could cause a false positive in the previous
+   byte, so always use the expression without carry there.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
+#ifdef CUSTOM_CMASK
+# define CMASK_PARAM_MASK CMASK_PARAM
+#else
+# define CMASK_PARAM int c_in
+# define CMASK_PARAM_MASK vector_int cmask
+#endif
+
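+/* With LOOKAHEAD defined, BYTE (n) reconstructs, from PREVIOUS and V, the
+   vector that starts LOOKAHEAD - n bytes before the current one, so an
+   expression can inspect byte n of a LOOKAHEAD-byte window without doing
+   a second load.  */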
+#ifdef LOOKAHEAD
+# define BYTE(n) \
+ (SHIFT_BYTES (SHIFT_BYTES (previous, LSIZE - (LOOKAHEAD - n) - 1), 1) \
+ | SHIFT_BYTES_UP (v, LOOKAHEAD - n))
+#endif
+
+#ifdef LOOKAHEAD
+# define _LOOKAHEAD LOOKAHEAD
+#else
+# define _LOOKAHEAD 0
+#endif
+
+#ifdef CHECK_N
+# define _CHECK_N(x) x
+#else
+# define _CHECK_N(x) 0
+#endif
+
+#define LOOP_SIZE (LOOP_UNROLL * LSIZE)
+
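+/* Turn a nonzero MASK for the vector loaded at X into the final result:
+   the address of the first matching byte adjusted back by LOOKAHEAD, or
+   NULL when CHECK_N is defined and the match lies at or beyond the N-byte
+   limit.  */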
+#define RETURN(x, mask) \
+ do \
+ { \
+ char *ret = x + first_nonzero_byte (mask); \
+ return (_CHECK_N (n <= ret - s)) ? NULL : ret - _LOOKAHEAD; \
+ } \
+ while (0)
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, CMASK_PARAM, size_t n)
+{
+ vector_int mask;
+ char *s = (char *) s_in;
+ vector_int v = 0;
+ vector_int __attribute__ ((unused)) previous = 0;
+#ifndef CUSTOM_CMASK
+ unsigned char c = (unsigned char) c_in;
+ vector_int __attribute__ ((unused)) cmask = broadcast (c);
+#endif
+
+  if (_CHECK_N (n == 0))
+ return NULL;
+
+  /* We fetch 32 bytes as long as they don't cross a page boundary.
+     Most strings in practice are of that size, so we avoid the loop.
+     This looks best in practice; the alternative below uses aligned loads
+     but is slower when the string starts just a few bytes before a
+     32-byte boundary.  The tradeoff is that we may occasionally fetch an
+     extra cache line without needing it, but this optimization does pay
+     for that.  */
+
+ if (!CROSS_PAGE (s, LSIZE * UNALIGNED_HEADER_UNROLL)
+ && UNALIGNED_HEADER_UNROLL > 0)
+ {
+#define UNALIGNED_HEADER_CHECK(i) \
+ if (UNALIGNED_HEADER_UNROLL >= i) \
+ { \
+ previous = v; \
+ v = LOADU (s + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ if (i == 1) \
+ mask = forget_bytes (mask, _LOOKAHEAD); \
+ if (mask) \
+ RETURN (s + (i - 1) * LSIZE, mask); \
+ }
+
+ UNALIGNED_HEADER_CHECK (1);
+ UNALIGNED_HEADER_CHECK (2);
+ UNALIGNED_HEADER_CHECK (3);
+ UNALIGNED_HEADER_CHECK (4);
+ UNALIGNED_HEADER_CHECK (5);
+ UNALIGNED_HEADER_CHECK (6);
+ UNALIGNED_HEADER_CHECK (7);
+ UNALIGNED_HEADER_CHECK (8);
+
+ if (_CHECK_N (n <= LSIZE * UNALIGNED_HEADER_UNROLL))
+ return NULL;
+ }
+ else
+ {
+      /* We have to use aligned loads.  For the first load we read some
+         bytes before the start and discard them by shifting them away.  */
+
+ char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+ v = LOAD (s_aligned);
+      /* Be careful here: the bytes before the start could overflow into
+         the string bytes, hence the expression without carry.  */
+ mask = SHIFT_BYTES ((EXPRESSION_NOCARRY (v, cmask)), s - s_aligned);
+ mask = forget_bytes (mask, _LOOKAHEAD);
+ if (mask)
+ RETURN (s, mask);
+
+      /* When the lookahead is i we need to ignore matches in the first i
+         bytes as false positives.  For a lookahead of 2 or more they could
+         cross a word boundary, so we need to mask them in the second word
+         as well.
+
+         We need to check for crossing the end of the string after each
+         iteration, as each one could cross a page boundary.  Note that we
+         don't have to check after the last iteration, as that would
+         duplicate the check in the loop.  */
+
+#define ALIGNED_HEADER_CHECK(i) \
+ if (ALIGNED_HEADER_UNROLL >= i) \
+ { \
+ if (_CHECK_N (n <= s_aligned + (i - 1) * LSIZE - s)) \
+ return NULL; \
+ previous = v; \
+ v = LOAD (s_aligned + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ if (i == 2 && _LOOKAHEAD > 2) \
+ mask = forget_bytes (mask, s + _LOOKAHEAD - (s_aligned + LSIZE)); \
+ if (mask) \
+ RETURN (s_aligned + (i - 1) * LSIZE, mask); \
+ }
+ ALIGNED_HEADER_CHECK (2);
+ ALIGNED_HEADER_CHECK (3);
+ ALIGNED_HEADER_CHECK (4);
+ ALIGNED_HEADER_CHECK (5);
+ ALIGNED_HEADER_CHECK (6);
+ ALIGNED_HEADER_CHECK (7);
+ ALIGNED_HEADER_CHECK (8);
+ }
+
+  /* Now we have read enough bytes to start the loop, assuming the
+     following: */
+
+ assert (UNALIGNED_HEADER_UNROLL <= 8);
+ assert (ALIGNED_HEADER_UNROLL <= 8);
+
+ assert (LOOP_UNROLL <= ALIGNED_HEADER_UNROLL);
+ assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+ || UNALIGNED_HEADER_UNROLL == 0);
+
+ char *s_loop = PTR_ALIGN_DOWN (s, LOOP_SIZE);
+
+#ifdef LOOKAHEAD
+ v = LOAD (s_loop + (LOOP_UNROLL - 1) * LSIZE);
+#endif
+
+#ifdef CHECK_N
+# ifdef SAVE_N_ITERS_REGISTER
+ while (s + n - s_loop >= LOOP_SIZE)
+# else
+ size_t n_iters = (n - (s_loop + LOOP_SIZE - s)
+ + LOOP_SIZE - 1) / (LOOP_SIZE);
+ while (n_iters--)
+# endif
+#else
+ while (1)
+#endif
+ {
+ s_loop += LOOP_SIZE;
+ vector_int masks[9];
+ masks[0] = 0;
+
+#define MERGE_MASK(i) \
+ if (LOOP_UNROLL >= i) \
+ { \
+ previous = v; \
+ v = LOAD (s_loop + (i - 1) * LSIZE); \
+ masks[i] = masks[i - 1] | EXPRESSION (v, cmask); \
+ }
+
+ MERGE_MASK (1);
+ MERGE_MASK (2);
+ MERGE_MASK (3);
+ MERGE_MASK (4);
+ MERGE_MASK (5);
+ MERGE_MASK (6);
+ MERGE_MASK (7);
+ MERGE_MASK (8);
+
+ if (masks[LOOP_UNROLL])
+ {
+	  /*
+	    For an optimal tail we need to help gcc.  We have already
+	    computed the combined masks with a partial or.  This lets us
+	    check only the first nonzero mask: when all previous masks were
+	    zero, the or of the previous masks with the current one equals
+	    the current one.
+
+	    That is nice, but the combined masks have to stay live across
+	    each iteration.  The ones that can be kept in registers are
+	    fine.
+
+	    The problem is the ones that cannot, where we must spill
+	    registers.  To make matters worse, gcc also thinks that
+	    spilling a partial result that is useless after each iteration
+	    is worth saving a few cycles of recomputation in the last
+	    iteration.  We prevent that with a compiler barrier.
+
+	    As we don't know how many registers are available without
+	    compiling to assembly, you need to try all possible values of
+	    TAIL_RECOMPUTE_FIRST, which recomputes only the first i checks.
+	   */
+
+#ifndef TAIL_RECOMPUTE_FIRST
+# define TAIL_RECOMPUTE_FIRST 0
+#endif
+
+# define CHECK_MASK(i) \
+ if (i <= TAIL_RECOMPUTE_FIRST) \
+ { \
+ asm volatile ("" : : : "memory"); \
+ previous = LOAD (s_loop + (i - 2) * LSIZE); \
+ v = LOAD (s_loop + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v, cmask); \
+ if (mask) \
+	RETURN (s_loop + (i - 1) * LSIZE, mask); \
+ } \
+ else \
+ if (masks[i]) \
+ RETURN (s_loop + (i - 1) * LSIZE, masks[i]);
+
+ CHECK_MASK (1);
+ CHECK_MASK (2);
+ CHECK_MASK (3);
+ CHECK_MASK (4);
+ CHECK_MASK (5);
+ CHECK_MASK (6);
+ CHECK_MASK (7);
+ CHECK_MASK (8);
+ }
+ }
+ return NULL;
+}
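+
+/* As an illustrative sketch of how a caller would use this skeleton (the
+   header names and the wrapper below are hypothetical; only EXPRESSION,
+   EXPRESSION_NOCARRY, CHECK_N and string_skeleton come from this patch),
+   a rawmemchr-style function searching for byte c with no length limit
+   could be written as:
+
+     #define EXPRESSION(v, cmask) bytes_equal (v, cmask)
+     #define EXPRESSION_NOCARRY(v, cmask) bytes_equal_nocarry (v, cmask)
+     #include <string_vector.h>
+     #include <string_skeleton.h>
+
+     void *
+     rawmemchr (const void *s, int c)
+     {
+       return string_skeleton (s, c, 0);
+     }
+
+   A memchr-style variant would additionally define CHECK_N before the
+   includes and pass the real length as the last argument, and a strlen
+   would use contains_zero (v) as the expression and subtract the start
+   pointer from the result.  */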