@@ -41,6 +41,24 @@
# define LOCALE_PARAM_DECL
#endif
+
+#include <string_vector.h>
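+
+/* EXPRESSION (c1, c2) must yield a vector with a nonzero byte wherever
+   the inputs differ or c2 contains a NUL byte.  contains_zero comes
+   from string_vector.h; as a sketch, a typical definition is the
+   classic bit trick
+
+     (v - ones) & ~v & high_bits
+
+   with ones = 0x0101...01 and high_bits = 0x8080...80, which sets the
+   high bit of each zero byte of v.  */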
+#define EXPRESSION(c1, c2) (((c1) ^ (c2)) | contains_zero (c2))
+
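+/* Bytewise fallback used by the skeleton before the first vector check
+   and around page boundaries.  Returns the index of the first NUL or
+   difference within the first N bytes, or N when there is none.  END
+   is only used by the CHECK_N variants.  */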
+static __always_inline
+size_t
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
+
+ return n;
+}
+
+#include <string_diff_skeleton.h>
+
/* Compare S1 and S2, ignoring case, returning less than, equal to or
greater than zero if S1 is lexicographically less than,
equal to or greater than S2. */
@@ -56,9 +74,8 @@ __strcasecmp (s1, s2 LOCALE_PARAM)
const unsigned char *p1 = (const unsigned char *) s1;
const unsigned char *p2 = (const unsigned char *) s2;
int result;
-
- if (p1 == p2)
- return 0;
+
+ diff_skeleton ((char **) &p1, (char **) &p2, 0);
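+  /* Skip the byte-identical prefix; the loop below handles the rest
+     case-insensitively.  */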
while ((result = TOLOWER (*p1) - TOLOWER (*p2++)) == 0)
if (*p1++ == '\0')
@@ -16,28 +16,37 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-
#undef strcmp
/* Compare S1 and S2, returning less than, equal to or
greater than zero if S1 is lexicographically less than,
equal to or greater than S2. */
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (((c1) ^ (c2)) | contains_zero (c2))
+
+static __always_inline
+size_t
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
+
+ return n;
+}
+
+#include <string_diff_skeleton.h>
+
int
-strcmp (const char *p1, const char *p2)
+strcmp (const char *s1_start, const char *s2_start)
{
- const unsigned char *s1 = (const unsigned char *) p1;
- const unsigned char *s2 = (const unsigned char *) p2;
- unsigned char c1, c2;
-
- do
- {
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0')
- return c1 - c2;
- }
- while (c1 == c2);
-
- return c1 - c2;
+ unsigned char *p1 = (unsigned char *) s1_start;
+ unsigned char *p2 = (unsigned char *) s2_start;
+
+ diff_skeleton ((char **) &p1, (char **) &p2, 0);
+
+ return *p1 - *p2;
}
libc_hidden_builtin_def (strcmp)
@@ -43,6 +43,33 @@
# define LOCALE_PARAM_DECL
#endif
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (((c1) ^ (c2)) | contains_zero (c2))
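+/* CHECK_N selects the n-limited variant of the skeleton.  Its
+   byte_loop additionally returns SIZE_MAX when the limit END is
+   reached before any difference.  */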
+#define CHECK_N
+
+static __always_inline
+size_t
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
+ {
+ if (i == end)
+ return SIZE_MAX;
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
+ }
+
+ return n;
+}
+
+#include <string_diff_skeleton.h>
+
/* Compare no more than N characters of S1 and S2,
ignoring case, returning less than, equal to or
greater than zero if S1 is lexicographically less
@@ -61,7 +88,7 @@ __strncasecmp (s1, s2, n LOCALE_PARAM)
const unsigned char *p2 = (const unsigned char *) s2;
int result;
- if (p1 == p2 || n == 0)
+ if (!diff_skeleton ((char **) &p1, (char **) &p2, n))
return 0;
while ((result = TOLOWER (*p1) - TOLOWER (*p2++)) == 0)
@@ -16,59 +16,44 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-#include <memcopy.h>
#undef strncmp
-#ifndef STRNCMP
-#define STRNCMP strncmp
-#endif
+/* Compare no more than N characters of S1 and S2, returning less than,
+   equal to or greater than zero if S1 is lexicographically less than,
+   equal to or greater than S2.  */
-/* Compare no more than N characters of S1 and S2,
- returning less than, equal to or greater than zero
- if S1 is lexicographically less than, equal to or
- greater than S2. */
-int
-STRNCMP (const char *s1, const char *s2, size_t n)
-{
- unsigned char c1 = '\0';
- unsigned char c2 = '\0';
-
- if (n >= 4)
- {
- size_t n4 = n >> 2;
- do
- {
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- } while (--n4 > 0);
- n &= 3;
- }
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (((c1) ^ (c2)) | contains_zero (c2))
+#define CHECK_N
- while (n > 0)
+static __always_inline
+size_t
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
{
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- n--;
+ if (i == end)
+ return SIZE_MAX;
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
}
- return c1 - c2;
+ return n;
}
-libc_hidden_builtin_def (STRNCMP)
+#include <string_diff_skeleton.h>
+
+int
+strncmp (const char *s1_start, const char *s2_start, size_t n)
+{
+ unsigned char *p1 = (unsigned char *) s1_start;
+ unsigned char *p2 = (unsigned char *) s2_start;
+
+ if (!diff_skeleton ((char **) &p1, (char **) &p2, n))
+ return 0;
+
+ return *p1 - *p2;
+}
+libc_hidden_builtin_def (strncmp)
@@ -33,6 +33,28 @@
#define LEN 3
+
+#include <string_vector.h>
+#define EXPRESSION(c1, c2) (((c1) ^ (c2)) | contains_zero (c2))
+
+static __always_inline
+size_t
+byte_loop (char *x, char *y, size_t n, size_t end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
+
+ return n;
+}
+
+#include <string_diff_skeleton.h>
+
/* Compare S1 and S2 as strings holding indices/version numbers,
returning less than, equal to or greater than zero if S1 is less than,
equal to or greater than S2 (for more info, see the texinfo doc).
@@ -46,6 +68,13 @@ __strverscmp (s1, s2)
const unsigned char *p1 = (const unsigned char *) s1;
const unsigned char *p2 = (const unsigned char *) s2;
+ diff_skeleton ((char **) &p1, (char **) &p2, 0);
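+  /* diff_skeleton leaves p1 and p2 at the first differing byte; back
+     up over the digit run around it so that the state machine below
+     sees any digits that precede the difference.  */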
+ while (isdigit (*p1) && p1 > (const unsigned char *) s1)
+ {
+ p1--;
+ p2--;
+ }
+
/* Symbol(s) 0 [1-9] others
Transition (10) 0 (01) d (00) x */
static const uint8_t next_state[] =
new file mode 100644
@@ -0,0 +1,279 @@
+/* Skeleton of generic string comparison functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
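+/* Generic skeleton used to implement strcmp-family functions.  Before
+   including this file a client includes string_vector.h and defines:
+
+     EXPRESSION (c1, c2)  vector expression that is nonzero where the
+                          inputs differ or c2 contains a NUL byte
+     byte_loop            bytewise fallback loop
+
+   and optionally CHECK_N to get the n-limited (strncmp-style) variant,
+   plus FIRST_CHECK_BYTEWISE and ALIGNING_HELPS for tuning.  On big
+   endian an EXPRESSION_NOCARRY variant is used instead of EXPRESSION.
+   The vector_int type, LSIZE, LOOP_UNROLL, UNALIGNED_HEADER_UNROLL and
+   the LOAD, LOADU, CROSS_PAGE, contains_zero, first_nonzero_byte and
+   forget_bytes helpers are expected to come from string_vector.h.  See
+   strcmp.c for a usage example.  */
+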
+#include <assert.h>
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+/* On big endian a carry in the zero test could cause a false positive
+   in the previous byte, so a carry-free expression must be used
+   there.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
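+/* Bytes processed per iteration of the main loop.  */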
+#define LOOP_SIZE (LOOP_UNROLL * LSIZE)
+
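+/* Number of initial bytes to compare bytewise before the first vector
+   check; see the comment in diff_skeleton.  */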
+#ifndef FIRST_CHECK_BYTEWISE
+# define FIRST_CHECK_BYTEWISE 0
+#endif
+
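+/* _CHECK_N (x) expands to x in the n-limited variant and to a constant
+   0 otherwise, so the size checks compile away in the strcmp-style
+   functions.  */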
+#ifdef CHECK_N
+# define _CHECK_N(x) x
+#else
+# define _CHECK_N(x) 0
+#endif
+
+/* A strdiff variant.  Sets *p1 and *p2 to the position of the first
+   difference.  If it reads N bytes without finding a difference it
+   returns 0, otherwise 1.  */
+
+static __always_inline
+int maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned);
+
+static __always_inline
+int
+diff_skeleton (char **p1, char **p2, size_t n)
+{
+ vector_int mask;
+ char *s1 = *p1, *s2 = *p2;
+ vector_int v1, v2;
+ vector_int __attribute__ ((unused)) previous = 0;
+
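+/* Set the output pointers to the difference at byte I of SA and SB; in
+   the n-limited variant report equality instead when that position is
+   at or past the N-byte limit.  */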
+#define RETURN(sa, sb, i) \
+ do \
+ { \
+ *p1 = sa + i; \
+ *p2 = sb + i; \
+ return _CHECK_N (n <= sa + i - s1) ? 0 : 1; \
+ } \
+ while (0)
+
+
+  /* Most comparisons differ in the first byte and checking that is
+     cheap.  A vector check is more expensive, so the question is
+     whether the saving from the byte check outweighs the penalty of a
+     branch misprediction.  The same reasoning could be applied to the
+     second byte, with the additional penalty of having checked the
+     first byte.  So we need to test empirically what is optimal for a
+     given architecture.  */
+
+  size_t i = byte_loop (s1, s2, FIRST_CHECK_BYTEWISE, n);
+ if (i < FIRST_CHECK_BYTEWISE)
+ RETURN (s1, s2, i);
+ if (i == SIZE_MAX)
+ return 0;
+
+ s1 += FIRST_CHECK_BYTEWISE;
+ s2 += FIRST_CHECK_BYTEWISE;
+ n -= FIRST_CHECK_BYTEWISE;
+
+ if (_CHECK_N (n == 0))
+ return 0;
+
+  /* We fetch UNALIGNED_HEADER_UNROLL * LSIZE bytes (typically 32) when
+     that does not cross a page boundary.  Most strings in practice are
+     of that size and we avoid a loop.  This looks best in practice;
+     the alternative below uses aligned loads but is slower when a
+     string starts just a few bytes before the alignment boundary.  A
+     tradeoff is that we could occasionally fetch an extra cache line
+     without needing it, but this optimization pays for that.  */
+
+ if (!CROSS_PAGE (s1, UNALIGNED_HEADER_UNROLL * LSIZE)
+ && !CROSS_PAGE (s2, UNALIGNED_HEADER_UNROLL * LSIZE)
+ && UNALIGNED_HEADER_UNROLL > 0)
+ {
+
+      /* TODO: When emulating unaligned loads we could align s1 after
+         the first iteration, or when FIRST_CHECK_BYTEWISE >= LSIZE.  */
+
+#define UNALIGNED_HEADER_CHECK(i) \
+ if (UNALIGNED_HEADER_UNROLL >= i) \
+ { \
+ v1 = LOADU (s1 + (i - 1) * LSIZE); \
+ v2 = LOADU (s2 + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v1, v2); \
+ if (mask) \
+ RETURN (s1 + (i - 1) * LSIZE, s2 + (i - 1) * LSIZE, \
+ first_nonzero_byte (mask)); \
+ }
+
+ UNALIGNED_HEADER_CHECK (1);
+ UNALIGNED_HEADER_CHECK (2);
+ UNALIGNED_HEADER_CHECK (3);
+ UNALIGNED_HEADER_CHECK (4);
+ UNALIGNED_HEADER_CHECK (5);
+ UNALIGNED_HEADER_CHECK (6);
+ UNALIGNED_HEADER_CHECK (7);
+ UNALIGNED_HEADER_CHECK (8);
+
+ if (_CHECK_N (n <= LSIZE * UNALIGNED_HEADER_UNROLL))
+ return 0;
+ }
+ else
+ {
+      size_t i = byte_loop (s1, s2, LOOP_SIZE, n);
+ if (i < LOOP_SIZE)
+ RETURN (s1, s2, i);
+ if (i == SIZE_MAX)
+ return 0;
+ }
+#ifndef ALIGNING_HELPS
+# define ALIGNING_HELPS 0
+#endif
+ return maybe_aligned_loop (p1, p2, n, ALIGNING_HELPS && ((s1 - s2) % LSIZE == 0));
+}
+
+/* By the time this is called enough bytes have been checked for the
+   loop to start at the next LOOP_SIZE boundary; the assert below
+   states the assumption.  */
+
+static __always_inline
+int
+maybe_aligned_loop (char **p1, char **p2, size_t n, int aligned)
+{
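+/* Load from the side that is aligned only when the two strings are
+   mutually aligned.  */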
+#define MAYBE_ALIGNED_LOAD(a) (aligned ? LOAD (a) : LOADU (a))
+ vector_int mask;
+ char *s1 = *p1, *s2 = *p2;
+ vector_int v1, v2;
+ vector_int __attribute__ ((unused)) previous = 0;
+
+ assert (LOOP_UNROLL <= UNALIGNED_HEADER_UNROLL
+ || UNALIGNED_HEADER_UNROLL == 0);
+
+ char *s1_loop = PTR_ALIGN_DOWN (s1, LOOP_SIZE);
+ char *s2_loop = s2 - (s1 - s1_loop);
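+
+  /* Iterations until a LOOP_SIZE load from s2_loop could cross a page
+     boundary; a 4096-byte page size is assumed here.  */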
+ int until_cross_page = (4096 - (((uintptr_t) (s2_loop + LOOP_SIZE))
+ % 4096)) / LOOP_SIZE;
+
+#ifdef CHECK_N
+# ifdef SAVE_N_ITERS_REGISTER
+  while (s1 + n - s1_loop >= LOOP_SIZE)
+# else
+ size_t n_iters = (n - (s1_loop + LOOP_SIZE - s1)
+ + LOOP_SIZE - 1) / (LOOP_SIZE);
+ while (n_iters--)
+# endif
+#else
+ while (1)
+#endif
+ {
+ s1_loop += LOOP_SIZE;
+ s2_loop += LOOP_SIZE;
+
+ if (until_cross_page == 0)
+ {
+ uintptr_t shift = ((uintptr_t) s2_loop) % LOOP_SIZE;
+
+          /* A tricky part here is that we need to mask out bytes that
+             could have been read in the previous iteration: those were
+             already checked, and when we come from the header they may
+             even lie before the start of the string.  forget_bytes
+             below discards that leading part of the mask.  */
+
+#define CROSSING_CHECK(i) \
+ if (i <= LOOP_UNROLL) \
+ { \
+ v1 = MAYBE_ALIGNED_LOAD (s1_loop - shift + (i - 1) * LSIZE); \
+ v2 = LOAD (s2_loop - shift + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v1, v2); \
+ mask = forget_bytes (mask, shift - LSIZE * (i - 1)); \
+ if (mask) \
+ RETURN (s1_loop - shift + (i - 1) * LSIZE, \
+ s2_loop - shift + (i - 1) * LSIZE, \
+ first_nonzero_byte (mask)); \
+ }
+ CROSSING_CHECK (1);
+ CROSSING_CHECK (2);
+ CROSSING_CHECK (3);
+ CROSSING_CHECK (4);
+ CROSSING_CHECK (5);
+ CROSSING_CHECK (6);
+ CROSSING_CHECK (7);
+ CROSSING_CHECK (8);
+
+ until_cross_page = 4096 / LOOP_SIZE;
+ if (_CHECK_N (n <= s1_loop + LOOP_SIZE - shift - s1))
+ return 0;
+ }
+ until_cross_page--;
+
+ vector_int masks[9];
+ masks[0] = 0;
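+
+      /* masks[i] accumulates the or of the first i chunk comparisons;
+         masks[0] stays zero as the base case for MERGE_MASK (1).  */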
+
+#define MERGE_MASK(i) \
+ if (LOOP_UNROLL >= i) \
+ { \
+ v1 = LOAD (s1_loop + (i - 1) * LSIZE); \
+ v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE); \
+ masks[i] = masks[i - 1] | EXPRESSION (v1, v2); \
+ }
+
+ MERGE_MASK (1);
+ MERGE_MASK (2);
+ MERGE_MASK (3);
+ MERGE_MASK (4);
+ MERGE_MASK (5);
+ MERGE_MASK (6);
+ MERGE_MASK (7);
+ MERGE_MASK (8);
+
+ if (masks[LOOP_UNROLL])
+ {
+
+          /* Here we have two possibilities depending on register
+             pressure.  When you don't have enough registers,
+             recalculating the results creates the fastest loop for
+             large inputs.  However, it is likely affected by a gcc bug
+             where gcc tries to save the intermediate results, causing
+             a spill in each iteration to speed up the final iteration
+             a bit.  To avoid that we need a compiler barrier here.  */
+
+#ifdef RECALCULATE_ON_TAIL
+ asm volatile ("" : : : "memory");
+# define CHECK_MASK(i) \
+ v1 = LOAD (s1_loop + (i - 1) * LSIZE); \
+ v2 = MAYBE_ALIGNED_LOAD (s2_loop + (i - 1) * LSIZE); \
+ mask = EXPRESSION (v1, v2); \
+ if (mask) \
+ RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE, \
+ first_nonzero_byte (mask));
+
+# else
+          /* On the other hand, when you have enough free registers you
+             can save the intermediate ors.  When you check the masks
+             you know that, to reach mask i, all previous ones must
+             have been zero, so the or of the previous masks is exactly
+             the current one; e.g. if masks[2] is zero then a nonzero
+             masks[3] means the first difference lies in the third
+             chunk.  */
+
+# define CHECK_MASK(i) \
+ if (masks[i]) \
+ RETURN (s1_loop + (i - 1) * LSIZE, s2_loop + (i - 1) * LSIZE, \
+ first_nonzero_byte (masks[i]));
+
+# endif
+
+ CHECK_MASK (1);
+ CHECK_MASK (2);
+ CHECK_MASK (3);
+ CHECK_MASK (4);
+ CHECK_MASK (5);
+ CHECK_MASK (6);
+ CHECK_MASK (7);
+ CHECK_MASK (8);
+ }
+ }
+ return 0;
+}