@@ -137,6 +137,8 @@ init_library (void)
#ifdef ENABLE_NLS
(void) bindtextdomain (PACKAGE, LOCALEDIR);
#endif
+
+ init_vectorized_lexer ();
}
}
@@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
return fputs ((const char *)s, f);
}
+extern void init_vectorized_lexer (void);
+
#ifdef __cplusplus
}
#endif
@@ -96,6 +96,408 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
buffer->notes_used++;
}
+
+/* Fast path to find line special characters using optimized character
+ scanning algorithms. Anything complicated falls back to the slow
+ path below. Since this loop is very hot it's worth doing these kinds
+ of optimizations.
+
+ One of the paths through the ifdefs should provide
+
+ bool search_line_fast (const uchar *s, const uchar *end,
+ const uchar **out)
+
+ Between S and END, search for \n, \r, \\, ?. Return true if found.
+ Always update *OUT to the last character scanned, even if not found. */
+
+/* ??? Should be in configure.ac. */
+#define WORDS_BIGENDIAN 0
+
+/* We'd like the largest integer that fits into a register. There's nothing
+ in <stdint.h> that gives us that. For most hosts this is unsigned long,
+ but MS decided on an LLP64 model. Thankfully when building with GCC we
+ can get the "real" word size. */
+#ifdef __GNUC__
+typedef unsigned int word_type __attribute__((__mode__(__word__)));
+#else
+typedef unsigned long word_type;
+#endif
+
+/* Return VAL with its first N bytes forced to values that cannot match
+   any of the interesting characters ('\n', '\r', '\\', '?').  Note that
+   NUL is not interesting.  Used to ignore the bytes that precede the
+   true start of a misaligned buffer.
+   NOTE(review): callers must pass 0 < N < sizeof(word_type); a shift by
+   the full word width would be undefined behavior.  The only caller
+   passes the misalignment, which satisfies this.  */
+
+static inline word_type
+acc_char_mask_misalign (word_type val, unsigned int n)
+{
+  word_type mask = -1;
+  /* The "first" bytes are the lowest-addressed ones: the high-order
+     bytes on a big-endian host, the low-order bytes on little-endian.  */
+  if (WORDS_BIGENDIAN)
+    mask >>= n * 8;
+  else
+    mask <<= n * 8;
+  return val & mask;
+}
+
+/* Return X replicated to all byte positions within WORD_TYPE.
+   E.g. for X == '\n' on a 4-byte word, returns 0x0a0a0a0a.  */
+
+static inline word_type
+acc_char_replicate (uchar x)
+{
+  word_type ret;
+
+  /* X promotes to int, so these shifts fill the low four bytes
+     without overflow.  */
+  ret = (x << 24) | (x << 16) | (x << 8) | x;
+  switch (sizeof (ret))
+    {
+    case 8:
+      /* Shift in two steps: a literal "<< 32" would not even compile
+	 cleanly when word_type is only 32 bits wide, dead code or not.  */
+      ret = (ret << 16 << 16) | ret;
+      break;
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  return ret;
+}
+
+/* Return non-zero if some byte of VAL is (probably) C.  C must be a
+   word with the target character replicated into every byte position
+   (see acc_char_replicate).  The generic path below may report false
+   positives; acc_char_index re-checks the actual bytes.  */
+
+static inline word_type
+acc_char_cmp (word_type val, word_type c)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* We can get exact results using a compare-bytes instruction.  Since
+     comparison is always >=, we could either do ((v >= c) & (c >= v))
+     for 3 operations or force matching bytes to zero and compare vs
+     zero with (0 >= (val ^ c)) for 2 operations.  */
+  return __builtin_alpha_cmpbge (0, val ^ c);
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* ??? Ideally we'd have some sort of builtin for this, so that we
+     can pack the 4 instances into fewer bundles.  */
+  word_type ret;
+  __asm__("pcmp1.eq %0 = %1, %2" : "=r"(ret) : "r"(val), "r"(c));
+  return ret;
+#else
+  /* Generic carry-propagation trick (cf. the classic "find zero byte"
+     idiom used by glibc's strchr): after VAL ^= C each matching byte
+     becomes zero, and ((x + magic) ^ ~x) & ~magic lights a detector
+     bit for each (probably) zero byte.  */
+  word_type magic = 0x7efefefeU;
+  switch (sizeof(word_type))
+    {
+    case 8:
+      /* Widen the magic constant to 64 bits, then ...  */
+      magic = (magic << 16 << 16) | 0xfefefefeU;
+      /* FALLTHRU */
+    case 4:
+      break;
+    default:
+      abort ();
+    }
+  magic |= 1;
+
+  val ^= c;
+  return ((val + magic) ^ ~val) & ~magic;
+#endif
+}
+
+/* Given the result of acc_char_cmp is non-zero, return the index of
+   the found character within the word.  If this was a false positive,
+   return -1.  */
+
+static inline int
+acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
+		word_type val ATTRIBUTE_UNUSED)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+  /* The cmpbge instruction sets *bits* of the result corresponding to
+     matches in the bytes with no false positives.  This means that clz/ctz
+     produces a correct unscaled result.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp));
+#elif defined(__GNUC__) && defined(__ia64__)
+  /* The pcmp1 instruction sets matching bytes to 0xff with no false
+     positives.  This means that clz/ctz produces a correct scaled result;
+     shifting right by 3 converts the bit index to a byte index.  */
+  return (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp)) >> 3;
+#else
+  unsigned int i;
+
+  /* Generic path: re-examine VAL byte by byte, since the generic
+     acc_char_cmp may have produced a false positive.  */
+  /* ??? It would be nice to force unrolling here,
+     and have all of these constants folded.  */
+  for (i = 0; i < sizeof(word_type); ++i)
+    {
+      uchar c;
+      if (WORDS_BIGENDIAN)
+	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
+      else
+	c = (val >> i * 8) & 0xff;
+
+      if (c == '\n' || c == '\r' || c == '\\' || c == '?')
+	return i;
+    }
+
+  return -1;
+#endif
+}
+
+/* A version of the fast scanner using bit fiddling techniques.
+
+   For 32-bit words, one would normally perform 16 comparisons and
+   16 branches.  With this algorithm one performs 24 arithmetic
+   operations and one branch.  Whether this is faster with a 32-bit
+   word size is going to be somewhat system dependent.
+
+   For 64-bit words, we eliminate twice the number of comparisons
+   and branches without increasing the number of arithmetic operations.
+   It's almost certainly going to be a win with 64-bit word size.  */
+
+static bool
+search_line_acc_char (const uchar *s, const uchar *end, const uchar **out)
+{
+  const word_type repl_nl = acc_char_replicate ('\n');
+  const word_type repl_cr = acc_char_replicate ('\r');
+  const word_type repl_bs = acc_char_replicate ('\\');
+  const word_type repl_qm = acc_char_replicate ('?');
+
+  unsigned int misalign;
+  ptrdiff_t left;
+  const word_type *p;
+  word_type val;
+
+  /* Don't bother optimizing very short lines; too much masking to do.
+     Returning false with *OUT = S makes the caller scan the whole
+     remainder bytewise.  */
+  left = end - s;
+  if (left < (ptrdiff_t) sizeof(word_type))
+    {
+      *out = s;
+      return false;
+    }
+
+  /* Align the buffer.  Mask out any bytes from before the beginning.  */
+  p = (word_type *)((uintptr_t)s & -sizeof(word_type));
+  val = *p;
+  misalign = (uintptr_t)s & (sizeof(word_type) - 1);
+  if (misalign)
+    {
+      val = acc_char_mask_misalign (val, misalign);
+      /* Compensate LEFT for the pre-start bytes now covered by P.  */
+      left += misalign;
+    }
+
+  /* Main loop.  */
+  while (1)
+    {
+      word_type m_nl, m_cr, m_bs, m_qm, t;
+
+      /* Each m_* has non-zero detector bits where VAL (probably)
+	 contains the corresponding character.  */
+      m_nl = acc_char_cmp (val, repl_nl);
+      m_cr = acc_char_cmp (val, repl_cr);
+      m_bs = acc_char_cmp (val, repl_bs);
+      m_qm = acc_char_cmp (val, repl_qm);
+      t = (m_nl | m_cr) | (m_bs | m_qm);
+
+      if (__builtin_expect (t != 0, 0))
+	{
+	  /* acc_char_index re-checks the bytes; a negative result was
+	     a false positive and we keep scanning.  */
+	  int i = acc_char_index (t, val);
+	  if (i >= 0)
+	    {
+	      *out = (const uchar *)p + i;
+	      return true;
+	    }
+	}
+
+      left -= sizeof(word_type);
+      ++p;
+      if (left < (ptrdiff_t) sizeof(word_type))
+	{
+	  /* Ran out of complete words; *OUT points at the first
+	     unscanned byte and the caller finishes bytewise.  */
+	  *out = (const uchar *)p;
+	  return false;
+	}
+      val = *p;
+    }
+}
+
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+/* We'd like to share this table between the sse2 and sse4.2 functions
+   below, but we can't define an SSE vector type outside of a context
+   of a function that forces SSE enabled.  So we must define this table
+   in terms of characters.  Row I contains -1 (0xff after conversion to
+   uchar) in its first I bytes and 0 elsewhere: OR-ing row MISALIGN into
+   an aligned chunk forces the MISALIGN pre-start bytes to 0xff, which
+   matches none of the four stopper characters.  */
+static const uchar sse_mask_align[16][16] __attribute__((aligned(16))) =
+{
+  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 },
+  { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 }
+};
+
+/* Fast path to find line special characters using SSE 4.2 vectorized string
+   instructions.  Anything complicated falls back to the slow path below.
+   Since this loop is very hot it's worth doing these kinds of
+   optimizations.  Returns true if stopper character found.
+
+   We should be using the _mm intrinsics, but the xxxintr headers do things
+   not allowed in gcc.  So instead use direct builtins.  */
+
+static bool __attribute__((__target__("sse4.2")))
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+  /* The four stopper characters; only the first 4 lanes are consulted
+     because the PCMPESTRI length operand below is 4.  */
+  static const m128i search
+    = { '\n', '\r', '?', '\\', 0,0,0,0,0,0,0,0,0,0,0,0 };
+
+  ptrdiff_t left;
+  unsigned int misalign;
+  const m128i *p;
+  m128i data;
+
+  left = end - s;
+  if (left <= 0)
+    goto no_match;
+
+  /* Align the pointer and mask out the bytes before the start.
+     OR-ing 0xff into the pre-start bytes keeps them from matching.  */
+  misalign = (uintptr_t)s & 15;
+  p = (m128i *)((uintptr_t)s & -16);
+  data = *p;
+  if (misalign)
+    {
+      data |= ((m128i *)sse_mask_align)[misalign];
+      left += misalign;
+    }
+
+  while (1)
+    {
+      /* Mode 0 (equal-any, unsigned bytes): return the index of the
+	 first byte of DATA (valid length LEFT, saturated to 16 by the
+	 instruction) matching any of the 4 SEARCH bytes, or 16 if no
+	 byte matches.  */
+      int index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+      if (__builtin_expect (index < 16, 0))
+	{
+	  *out = (const uchar *)p + index;
+	  return true;
+	}
+
+      left -= 16;
+      if (left <= 0)
+	goto no_match;
+
+      data = *++p;
+    }
+
+ no_match:
+  /* No match within buffer.  */
+  *out = end;
+  return false;
+}
+
+/* A pre-SSE4.2 version of the fast scanner: compare each 16-byte chunk
+   against all four stopper characters with PCMPEQB, merge the results,
+   and extract a per-byte match bit-mask with PMOVMSKB.  Same contract
+   as the other search_line_* routines: search [S, END) for \n, \r, \\,
+   ?; on success return true with *OUT at the match, otherwise return
+   false with *OUT = END.  */
+
+static bool __attribute__((__target__("sse2")))
+search_line_sse2 (const uchar *s, const uchar *end, const uchar **out)
+{
+  typedef char m128i __attribute__ ((__vector_size__ (16)));
+
+  static const m128i repl_nl = {
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+  };
+  static const m128i repl_cr = {
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+  };
+  static const m128i repl_bs = {
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+  };
+  static const m128i repl_qm = {
+    '?', '?', '?', '?', '?', '?', '?', '?',
+    '?', '?', '?', '?', '?', '?', '?', '?',
+  };
+
+  ptrdiff_t left;
+  unsigned int misalign, mask;
+  const m128i *p;
+  m128i data, t;
+
+  left = end - s;
+  if (left <= 0)
+    goto no_match;
+
+  /* Align the pointer and mask out the bytes before the start:
+     pre-start bytes become 0xff, which matches no stopper.  */
+  misalign = (uintptr_t)s & 15;
+  p = (m128i *)((uintptr_t)s & -16);
+  data = *p;
+  if (misalign)
+    {
+      data |= ((const m128i *)sse_mask_align)[misalign];
+      left += misalign;
+    }
+
+  while (1)
+    {
+      /* Byte-equality against each stopper; T accumulates 0xff in any
+	 lane equal to at least one of them.  */
+      t = __builtin_ia32_pcmpeqb128(data, repl_nl);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+      mask = __builtin_ia32_pmovmskb128 (t);
+
+      if (__builtin_expect (mask, 0))
+	break;
+
+      left -= 16;
+      if (left <= 0)
+	goto no_match;
+      data = *++p;
+    }
+
+  /* If there were less than 16 bytes left in the buffer, then there will
+     be comparisons beyond the end of buffer.  Mask them out.
+     NOTE(review): (2u << left) - 1 also keeps bit LEFT, i.e. a match
+     exactly at END is reported; this relies on the byte at END being
+     readable (libcpp buffers are terminated) -- confirm.  */
+  if (left < 16)
+    {
+      mask &= (2u << left) - 1;
+      if (mask == 0)
+	goto no_match;
+    }
+
+  /* Report the match through *OUT.  (The previous revision assigned the
+     local parameter S here, which was lost on return, leaving the
+     caller's pointer unset on success.)  */
+  *out = (const uchar *)p + __builtin_ctz (mask);
+  return true;
+
+ no_match:
+  /* No match within buffer.  */
+  *out = end;
+  return false;
+}
+
+/* Check if CPU supports vectorized string instructions.  */
+
+/* ??? Relative include path; should be provided by the build system.  */
+#include "../gcc/config/i386/cpuid.h"
+
+/* The scanner actually used by _cpp_clean_line, selected once at
+   startup by init_vectorized_lexer; defaults to the portable
+   bit-fiddling version.  */
+static bool (*search_line_fast)(const uchar *, const uchar *, const uchar **)
+  = search_line_acc_char;
+
+/* Runtime CPU detection: prefer SSE4.2, then SSE2, else keep the
+   generic accumulator scanner.  Called once from init_library.  */
+void
+init_vectorized_lexer (void)
+{
+  unsigned dummy, ecx, edx;
+
+  /* CPUID leaf 1: feature flags in ECX (SSE4.2) and EDX (SSE2).  */
+  if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx))
+    {
+      if (ecx & bit_SSE4_2)
+	search_line_fast = search_line_sse42;
+      else if (edx & bit_SSE2)
+	search_line_fast = search_line_sse2;
+    }
+}
+
+#else
+
+/* We only have one accelerated alternative.  Use a direct call so that
+   we encourage inlining.  We must still provide the init_vectorized_lexer
+   entry point, even though it does nothing.  */
+
+#define search_line_fast search_line_acc_char
+
+/* No runtime CPU selection is possible on this configuration.  */
+void
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
/* Returns with a logical line that contains no escaped newlines or
trigraphs. This is a time-critical inner loop. */
void
@@ -109,12 +511,34 @@ _cpp_clean_line (cpp_reader *pfile)
buffer->cur_note = buffer->notes_used = 0;
buffer->cur = buffer->line_base = buffer->next_line;
buffer->need_line = false;
- s = buffer->next_line - 1;
+ s = buffer->next_line;
if (!buffer->from_stage3)
{
const uchar *pbackslash = NULL;
+ found_bs:
+ /* Perform an optimized search for \n, \r, \\, ?. */
+ if (search_line_fast (s, buffer->rlimit, &s))
+ {
+ c = *s;
+
+ /* Special case for backslash which is reasonably common.
+ Continue searching using the fast path. */
+ if (c == '\\')
+ {
+ pbackslash = s;
+ s++;
+ goto found_bs;
+ }
+ if (__builtin_expect (c == '?', false))
+ goto found_qm;
+ else
+ goto found_nl_cr;
+ }
+
+ s--;
+
/* Short circuit for the common case of an un-escaped line with
no trigraphs. The primary win here is by not writing any
data back to memory until we have to. */
@@ -124,6 +548,7 @@ _cpp_clean_line (cpp_reader *pfile)
if (__builtin_expect (c == '\n', false)
|| __builtin_expect (c == '\r', false))
{
+ found_nl_cr:
d = (uchar *) s;
if (__builtin_expect (s == buffer->rlimit, false))
@@ -157,26 +582,28 @@ _cpp_clean_line (cpp_reader *pfile)
}
if (__builtin_expect (c == '\\', false))
pbackslash = s;
- else if (__builtin_expect (c == '?', false)
- && __builtin_expect (s[1] == '?', false)
- && _cpp_trigraph_map[s[2]])
+ else if (__builtin_expect (c == '?', false))
{
- /* Have a trigraph. We may or may not have to convert
- it. Add a line note regardless, for -Wtrigraphs. */
- add_line_note (buffer, s, s[2]);
- if (CPP_OPTION (pfile, trigraphs))
+ found_qm:
+ if (__builtin_expect (s[1] == '?', false)
+ && _cpp_trigraph_map[s[2]])
{
- /* We do, and that means we have to switch to the
- slow path. */
- d = (uchar *) s;
- *d = _cpp_trigraph_map[s[2]];
- s += 2;
- break;
+ /* Have a trigraph. We may or may not have to convert
+ it. Add a line note regardless, for -Wtrigraphs. */
+ add_line_note (buffer, s, s[2]);
+ if (CPP_OPTION (pfile, trigraphs))
+ {
+ /* We do, and that means we have to switch to the
+ slow path. */
+ d = (uchar *) s;
+ *d = _cpp_trigraph_map[s[2]];
+ s += 2;
+ break;
+ }
}
}
}
-
for (;;)
{
c = *++s;
@@ -184,7 +611,7 @@ _cpp_clean_line (cpp_reader *pfile)
if (c == '\n' || c == '\r')
{
- /* Handle DOS line endings. */
+ /* Handle DOS line endings. */
if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
s++;
if (s == buffer->rlimit)
@@ -215,9 +642,8 @@ _cpp_clean_line (cpp_reader *pfile)
}
else
{
- do
+ while (*s != '\n' && *s != '\r')
s++;
- while (*s != '\n' && *s != '\r');
d = (uchar *) s;
/* Handle DOS line endings. */
@@ -29,6 +29,9 @@ along with GCC; see the file COPYING3. If not see
#ifdef HAVE_STDDEF_H
# include <stddef.h>
#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
#include <stdio.h>