@@ -1,5 +1,8 @@
/* config.in. Generated from configure.ac by autoheader. */
+/* Define if building universal (internal helper macro) */
+#undef AC_APPLE_UNIVERSAL_BUILD
+
/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
systems. This function is required for `alloca.c' support on those systems.
*/
@@ -209,6 +212,9 @@
/* Define if <sys/types.h> defines \`uchar'. */
#undef HAVE_UCHAR
+/* Define to 1 if the system has the type `uintptr_t'. */
+#undef HAVE_UINTPTR_T
+
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
@@ -266,6 +272,18 @@
/* Define to 1 if your <sys/time.h> declares `struct tm'. */
#undef TM_IN_SYS_TIME
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+ significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+# define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+# undef WORDS_BIGENDIAN
+# endif
+#endif
+
/* Define to empty if `const' does not conform to ANSI C. */
#undef const
@@ -278,8 +296,15 @@
/* Define to `long int' if <sys/types.h> does not define. */
#undef off_t
+/* Define to `int' if <sys/types.h> does not define. */
+#undef ptrdiff_t
+
/* Define to `unsigned int' if <sys/types.h> does not define. */
#undef size_t
/* Define to `int' if <sys/types.h> does not define. */
#undef ssize_t
+
+/* Define to the type of an unsigned integer type wide enough to hold a
+ pointer, if such a type exists, and if the system does not define it. */
+#undef uintptr_t
@@ -1846,6 +1846,48 @@ fi
} # ac_fn_cxx_check_header_mongrel
+# ac_fn_cxx_try_run LINENO
+# ------------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_cxx_try_run ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ if { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+ { { case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: program exited with status $ac_status" >&5
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=$ac_status
+fi
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+ eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
+ return $ac_retval
+
+} # ac_fn_cxx_try_run
+
# ac_fn_cxx_try_link LINENO
# -------------------------
# Try to link conftest.$ac_ext, and return whether this succeeded.
@@ -1946,48 +1988,6 @@ $as_echo "$ac_res" >&6; }
} # ac_fn_cxx_check_type
-# ac_fn_cxx_try_run LINENO
-# ------------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
-# that executables *can* be run.
-ac_fn_cxx_try_run ()
-{
- as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
- if { { ac_try="$ac_link"
-case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
- (eval "$ac_link") 2>&5
- ac_status=$?
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
- test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
- { { case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
- (eval "$ac_try") 2>&5
- ac_status=$?
- $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
- test $ac_status = 0; }; }; then :
- ac_retval=0
-else
- $as_echo "$as_me: program exited with status $ac_status" >&5
- $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
- ac_retval=$ac_status
-fi
- rm -rf conftest.dSYM conftest_ipa8_conftest.oo
- eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
- return $ac_retval
-
-} # ac_fn_cxx_try_run
-
# ac_fn_cxx_compute_int LINENO EXPR VAR INCLUDES
# ----------------------------------------------
# Tries to find the compile-time value of EXPR in a program that includes
@@ -5172,6 +5172,230 @@ done
fi
# Checks for typedefs, structures, and compiler characteristics.
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
+$as_echo_n "checking whether byte ordering is bigendian... " >&6; }
+if test "${ac_cv_c_bigendian+set}" = set; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_cv_c_bigendian=unknown
+ # See if we're dealing with a universal compiler.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#ifndef __APPLE_CC__
+ not a universal capable compiler
+ #endif
+ typedef int dummy;
+
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+ # Check for potential -arch flags. It is not universal unless
+ # there are at least two -arch flags with different values.
+ ac_arch=
+ ac_prev=
+ for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do
+ if test -n "$ac_prev"; then
+ case $ac_word in
+ i?86 | x86_64 | ppc | ppc64)
+ if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then
+ ac_arch=$ac_word
+ else
+ ac_cv_c_bigendian=universal
+ break
+ fi
+ ;;
+ esac
+ ac_prev=
+ elif test "x$ac_word" = "x-arch"; then
+ ac_prev=arch
+ fi
+ done
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ if test $ac_cv_c_bigendian = unknown; then
+ # See if sys/param.h defines the BYTE_ORDER macro.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <sys/types.h>
+ #include <sys/param.h>
+
+int
+main ()
+{
+#if ! (defined BYTE_ORDER && defined BIG_ENDIAN \
+ && defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \
+ && LITTLE_ENDIAN)
+ bogus endian macros
+ #endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+ # It does; now see whether it defined to BIG_ENDIAN or not.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <sys/types.h>
+ #include <sys/param.h>
+
+int
+main ()
+{
+#if BYTE_ORDER != BIG_ENDIAN
+ not big endian
+ #endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+ ac_cv_c_bigendian=yes
+else
+ ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ fi
+ if test $ac_cv_c_bigendian = unknown; then
+ # See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris).
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <limits.h>
+
+int
+main ()
+{
+#if ! (defined _LITTLE_ENDIAN || defined _BIG_ENDIAN)
+ bogus endian macros
+ #endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+ # It does; now see whether it defined to _BIG_ENDIAN or not.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <limits.h>
+
+int
+main ()
+{
+#ifndef _BIG_ENDIAN
+ not big endian
+ #endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+ ac_cv_c_bigendian=yes
+else
+ ac_cv_c_bigendian=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ fi
+ if test $ac_cv_c_bigendian = unknown; then
+ # Compile a test program.
+ if test "$cross_compiling" = yes; then :
+ # Try to guess by grepping values from an object file.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+short int ascii_mm[] =
+ { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 };
+ short int ascii_ii[] =
+ { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 };
+ int use_ascii (int i) {
+ return ascii_mm[i] + ascii_ii[i];
+ }
+ short int ebcdic_ii[] =
+ { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 };
+ short int ebcdic_mm[] =
+ { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 };
+ int use_ebcdic (int i) {
+ return ebcdic_mm[i] + ebcdic_ii[i];
+ }
+ extern int foo;
+
+int
+main ()
+{
+return use_ascii (foo) == use_ebcdic (foo);
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+ if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then
+ ac_cv_c_bigendian=yes
+ fi
+ if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then
+ if test "$ac_cv_c_bigendian" = unknown; then
+ ac_cv_c_bigendian=no
+ else
+ # finding both strings is unlikely to happen, but who knows?
+ ac_cv_c_bigendian=unknown
+ fi
+ fi
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$ac_includes_default
+int
+main ()
+{
+
+ /* Are we little or big endian? From Harbison&Steele. */
+ union
+ {
+ long int l;
+ char c[sizeof (long int)];
+ } u;
+ u.l = 1;
+ return u.c[sizeof (long int) - 1] == 1;
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_run "$LINENO"; then :
+ ac_cv_c_bigendian=no
+else
+ ac_cv_c_bigendian=yes
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+ conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5
+$as_echo "$ac_cv_c_bigendian" >&6; }
+ case $ac_cv_c_bigendian in #(
+ yes)
+ $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h
+;; #(
+ no)
+ ;; #(
+ universal)
+
+$as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h
+
+ ;; #(
+ *)
+ as_fn_error "unknown endianness
+ presetting ac_cv_c_bigendian=no (or yes) will help" "$LINENO" 5 ;;
+ esac
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5
$as_echo_n "checking for an ANSI C-conforming const... " >&6; }
if test "${ac_cv_c_const+set}" = set; then :
@@ -5371,6 +5595,53 @@ _ACEOF
fi
+
+ ac_fn_cxx_check_type "$LINENO" "uintptr_t" "ac_cv_type_uintptr_t" "$ac_includes_default"
+if test "x$ac_cv_type_uintptr_t" = x""yes; then :
+
+$as_echo "#define HAVE_UINTPTR_T 1" >>confdefs.h
+
+else
+ for ac_type in 'unsigned int' 'unsigned long int' \
+ 'unsigned long long int'; do
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(sizeof (void *) <= sizeof ($ac_type))];
+test_array [0] = 0
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"; then :
+
+cat >>confdefs.h <<_ACEOF
+#define uintptr_t $ac_type
+_ACEOF
+
+ ac_type=
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ test -z "$ac_type" && break
+ done
+fi
+
+
+ac_fn_cxx_check_type "$LINENO" "ptrdiff_t" "ac_cv_type_ptrdiff_t" "$ac_includes_default"
+if test "x$ac_cv_type_ptrdiff_t" = x""yes; then :
+
+else
+
+cat >>confdefs.h <<_ACEOF
+#define ptrdiff_t int
+_ACEOF
+
+fi
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether struct tm is in sys/time.h or time.h" >&5
$as_echo_n "checking whether struct tm is in sys/time.h or time.h... " >&6; }
if test "${ac_cv_struct_tm+set}" = set; then :
@@ -7042,6 +7313,7 @@ LTLIBOBJS=$ac_ltlibobjs
+
: ${CONFIG_STATUS=./config.status}
ac_write_fail=0
ac_clean_files_save=$ac_clean_files
@@ -70,12 +70,15 @@ else
fi
# Checks for typedefs, structures, and compiler characteristics.
+AC_C_BIGENDIAN
AC_C_CONST
AC_C_INLINE
AC_FUNC_OBSTACK
AC_TYPE_OFF_T
AC_TYPE_SIZE_T
-AC_CHECK_TYPE(ssize_t, int)
+AC_TYPE_SSIZE_T
+AC_TYPE_UINTPTR_T
+AC_CHECK_TYPE(ptrdiff_t, int)
AC_STRUCT_TM
AC_CHECK_SIZEOF(int)
AC_CHECK_SIZEOF(long)
@@ -137,6 +137,8 @@ init_library (void)
#ifdef ENABLE_NLS
(void) bindtextdomain (PACKAGE, LOCALEDIR);
#endif
+
+ init_vectorized_lexer ();
}
}
@@ -725,6 +725,8 @@ ufputs (const unsigned char *s, FILE *f)
return fputs ((const char *)s, f);
}
+extern void init_vectorized_lexer (void);
+
#ifdef __cplusplus
}
#endif
@@ -96,6 +96,442 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
buffer->notes_used++;
}
+
+/* Fast path to find line special characters using optimized character
+ scanning algorithms. Anything complicated falls back to the slow
+ path below. Since this loop is very hot it's worth doing these kinds
+ of optimizations.
+
+ One of the paths through the ifdefs should provide
+
+ bool search_line_fast (const uchar *s, const uchar *end,
+ const uchar **out)
+
+ Between S and END, search for \n, \r, \\, ?. Return true if found.
+ Always update *OUT to the last character scanned, even if not found. */
+
+/* Configure gives us an ifdef test. */
+#ifndef WORDS_BIGENDIAN
+#define WORDS_BIGENDIAN 0
+#endif
+
+/* We'd like the largest integer that fits into a register. There's nothing
+ in <stdint.h> that gives us that. For most hosts this is unsigned long,
+ but MS decided on an LLP64 model. Thankfully when building with GCC we
+ can get the "real" word size. */
+#ifdef __GNUC__
+typedef unsigned int word_type __attribute__((__mode__(__word__)));
+#else
+typedef unsigned long word_type;
+#endif
+
+/* The code below is only expecting sizes 4 or 8.
+ Die at compile-time if this expectation is violated. */
+typedef char check_word_type_size
+ [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
+
+/* Return X with the first N bytes forced to values that won't match one
+ of the interesting characters. Note that NUL is not interesting. */
+
+static inline word_type
+acc_char_mask_misalign (word_type val, unsigned int n)
+{
+ word_type mask = -1;
+ if (WORDS_BIGENDIAN)
+ mask >>= n * 8;
+ else
+ mask <<= n * 8;
+ return val & mask;
+}
+
+/* Return X replicated to all byte positions within WORD_TYPE. */
+
+static inline word_type
+acc_char_replicate (uchar x)
+{
+ word_type ret;
+
+ ret = (x << 24) | (x << 16) | (x << 8) | x;
+ if (sizeof(word_type) == 8)
+ ret = (ret << 16 << 16) | ret;
+ return ret;
+}
+
+/* Return non-zero if some byte of VAL is (probably) C. */
+
+static inline word_type
+acc_char_cmp (word_type val, word_type c)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+ /* We can get exact results using a compare-bytes instruction. Since
+ comparison is always >=, we could either do ((v >= c) & (c >= v))
+ for 3 operations or force matching bytes to zero and compare vs
+ zero with (0 >= (val ^ c)) for 2 operations. */
+ return __builtin_alpha_cmpbge (0, val ^ c);
+#elif defined(__GNUC__) && defined(__ia64__)
+ /* ??? Ideally we'd have some sort of builtin for this, so that we
+ can pack the 4 instances into fewer bundles. */
+ word_type ret;
+ __asm__("pcmp1.eq %0 = %1, %2" : "=r"(ret) : "r"(val), "r"(c));
+ return ret;
+#else
+ word_type magic = 0x7efefefeU;
+ if (sizeof(word_type) == 8)
+ magic = (magic << 16 << 16) | 0xfefefefeU;
+ magic |= 1;
+
+ val ^= c;
+ return ((val + magic) ^ ~val) & ~magic;
+#endif
+}
+
+/* Given the result of acc_char_cmp is non-zero, return the index of
+ the found character. If this was a false positive, return -1. */
+
+static inline int
+acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
+ word_type val ATTRIBUTE_UNUSED)
+{
+#if defined(__GNUC__) && defined(__alpha__)
+ /* The cmpbge instruction sets *bits* of the result corresponding to
+ matches in the bytes with no false positives. This means that ctz
+ produces a correct unscaled result. If big-endian (aka Cray), the
+ clz will include the 7 bytes before the least significant. */
+ if (WORDS_BIGENDIAN)
+ return __builtin_clzl (cmp) - 56;
+ else
+ return __builtin_ctzl (cmp);
+#elif defined(__GNUC__) && defined(__ia64__)
+ /* The pcmp1 instruction sets matching bytes to 0xff with no false
+ positives. This means that clz/ctz produces a correct scaled result. */
+ unsigned int i;
+ i = (WORDS_BIGENDIAN ? __builtin_clzl (cmp) : __builtin_ctzl (cmp));
+ return i / 8;
+#else
+ unsigned int i;
+
+ /* ??? It would be nice to force unrolling here,
+ and have all of these constants folded. */
+ for (i = 0; i < sizeof(word_type); ++i)
+ {
+ uchar c;
+ if (WORDS_BIGENDIAN)
+ c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
+ else
+ c = (val >> i * 8) & 0xff;
+
+ if (c == '\n' || c == '\r' || c == '\\' || c == '?')
+ return i;
+ }
+
+ return -1;
+#endif
+}
+
+/* A version of the fast scanner using bit fiddling techniques.
+
+ For 32-bit words, one would normally perform 16 comparisons and
+ 16 branches. With this algorithm one performs 24 arithmetic
+ operations and one branch. Whether this is faster with a 32-bit
+ word size is going to be somewhat system dependent.
+
+ For 64-bit words, we eliminate twice the number of comparisons
+ and branches without increasing the number of arithmetic operations.
+ It's almost certainly going to be a win with 64-bit word size. */
+
+static bool
+search_line_acc_char (const uchar *s, const uchar *end, const uchar **out)
+{
+ const word_type repl_nl = acc_char_replicate ('\n');
+ const word_type repl_cr = acc_char_replicate ('\r');
+ const word_type repl_bs = acc_char_replicate ('\\');
+ const word_type repl_qm = acc_char_replicate ('?');
+
+ unsigned int misalign;
+ ptrdiff_t left;
+ const word_type *p;
+ word_type val;
+
+ /* Don't bother optimizing very short lines; too much masking to do. */
+ left = end - s;
+ if (left < (ptrdiff_t) sizeof(word_type))
+ {
+ *out = s;
+ return false;
+ }
+
+ /* Align the buffer. Mask out any bytes from before the beginning. */
+ p = (word_type *)((uintptr_t)s & -sizeof(word_type));
+ val = *p;
+ misalign = (uintptr_t)s & (sizeof(word_type) - 1);
+ if (misalign)
+ {
+ val = acc_char_mask_misalign (val, misalign);
+ left += misalign;
+ }
+
+ /* Main loop. */
+ while (1)
+ {
+ word_type m_nl, m_cr, m_bs, m_qm, t;
+
+ m_nl = acc_char_cmp (val, repl_nl);
+ m_cr = acc_char_cmp (val, repl_cr);
+ m_bs = acc_char_cmp (val, repl_bs);
+ m_qm = acc_char_cmp (val, repl_qm);
+ t = (m_nl | m_cr) | (m_bs | m_qm);
+
+ if (__builtin_expect (t != 0, 0))
+ {
+ int i = acc_char_index (t, val);
+ if (i >= 0)
+ {
+ *out = (const uchar *)p + i;
+ return true;
+ }
+ }
+
+ left -= sizeof(word_type);
+ ++p;
+ if (left < (ptrdiff_t) sizeof(word_type))
+ {
+ /* Ran out of complete words. */
+ *out = (const uchar *)p;
+ return false;
+ }
+ val = *p;
+ }
+}
+
+#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__))
+/* A version of the fast scanner using SSE 4.2 vectorized string insns.
+
+ We should be using the _mm intrinsics, but the xxxintr headers do things
+ not allowed in gcc. So instead use direct builtins. */
+
+static bool
+#ifndef __SSE4_2__
+__attribute__((__target__("sse4.2")))
+#endif
+search_line_sse42 (const uchar *s, const uchar *end, const uchar **out)
+{
+ typedef char m128i __attribute__ ((__vector_size__ (16)));
+ static const m128i search
+ = { '\n', '\r', '?', '\\', 0,0,0,0,0,0,0,0,0,0,0,0 };
+
+ ptrdiff_t left;
+ m128i data;
+ int index;
+
+ /* Main loop, processing 16 bytes at a time. */
+ for (left = end - s; left >= 16; left -= 16, s += 16)
+ {
+ data = __builtin_ia32_loaddqu((const char *)s);
+ index = __builtin_ia32_pcmpestri128 (search, 4, data, 16, 0);
+ if (__builtin_expect (index < 16, 0))
+ {
+ *out = (const uchar *)s + index;
+ return true;
+ }
+ }
+
+ /* There are less than 16 bytes remaining. If we can read those bytes
+ without reading from a possibly unmapped next page, then go ahead and
+ do so. If we are near the end of the page then don't bother; returning
+ will scan the balance of the buffer byte-by-byte. */
+ if (left > 0 && ((uintptr_t)s & 0xfff) <= 0xff0)
+ {
+ data = __builtin_ia32_loaddqu((const char *)s);
+ index = __builtin_ia32_pcmpestri128 (search, 4, data, left, 0);
+ if (__builtin_expect (index < 16, 0))
+ {
+ *out = (const uchar *)s + index;
+ return true;
+ }
+ s = end;
+ }
+
+ /* No match within buffer. */
+ *out = s;
+ return false;
+}
+
+/* A version of the fast scanner using SSE2 vectorized byte compare insns. */
+
+static bool
+#ifndef __SSE2__
+__attribute__((__target__("sse2")))
+#endif
+search_line_sse2 (const uchar *s, const uchar *end, const uchar **out)
+{
+ typedef char m128i __attribute__ ((__vector_size__ (16)));
+
+ static const m128i mask_align[16] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0 }
+ };
+
+ static const m128i repl_nl = {
+ '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+ '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
+ };
+ static const m128i repl_cr = {
+ '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+ '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
+ };
+ static const m128i repl_bs = {
+ '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+ '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
+ };
+ static const m128i repl_qm = {
+ '?', '?', '?', '?', '?', '?', '?', '?',
+ '?', '?', '?', '?', '?', '?', '?', '?',
+ };
+
+ ptrdiff_t left;
+ unsigned int misalign, mask;
+ const m128i *p;
+ m128i data, t;
+
+ left = end - s;
+ if (left <= 0)
+ goto no_match;
+
+ /* Align the pointer and mask out the bytes before the start. */
+ misalign = (uintptr_t)s & 15;
+ p = (const m128i *)((uintptr_t)s & -16);
+ data = *p;
+ if (misalign)
+ {
+ data |= mask_align[misalign];
+ left += misalign;
+ }
+
+ while (1)
+ {
+ t = __builtin_ia32_pcmpeqb128(data, repl_nl);
+ t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
+ t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
+ t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
+ mask = __builtin_ia32_pmovmskb128 (t);
+
+ if (__builtin_expect (mask, 0))
+ {
+ /* Match found. If there were less than 16 bytes left in the
+ buffer, then we may have found a match beyond the end of
+ the buffer. Mask them out. */
+ if (left < 16)
+ {
+ mask &= (2u << left) - 1;
+ if (mask == 0)
+ goto no_match;
+ }
+
+ *out = (const uchar *)p + __builtin_ctz (mask);
+ return true;
+ }
+
+ left -= 16;
+ if (left <= 0)
+ goto no_match;
+ data = *++p;
+ }
+
+ no_match:
+ /* No match within buffer. */
+ *out = end;
+ return false;
+}
+
+/* Check the CPU capabilities. */
+
+#include "../gcc/config/i386/cpuid.h"
+
+/* 0 = sse4.2, 1 = sse2, 2 = integer only. */
+static int fast_impl;
+
+void
+init_vectorized_lexer (void)
+{
+ unsigned dummy, ecx, edx;
+ int impl = 2;
+
+ if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx))
+ {
+ if (ecx & bit_SSE4_2)
+ impl = 0;
+ else if (edx & bit_SSE2)
+ impl = 1;
+ }
+
+ fast_impl = impl;
+}
+
+/* Dispatch to one of the vectorized implementations based on the
+ capabilities of the cpu as detected above in init_vectorized_lexer
+ and as told to us by the compiler.
+
+ Not all x86 cpus have a branch predictor that can handle indirect
+ branches. It is reportedly better to have two very well predicted
+ direct branches instead.
+
+ Further, this may allow the vectorized implementations to be
+ inlined into _cpp_clean_line. */
+
+static inline bool
+search_line_fast (const uchar *s, const uchar *end, const uchar **out)
+{
+ int impl = fast_impl;
+
+ /* If the compiler tells us that a given ISA extension is available,
+ force the following branch to be taken. This allows the compiler
+ to elide the rest as unreachable code while avoiding more ifdefs. */
+#ifdef __SSE4_2__
+ impl = 0;
+#endif
+
+ if (impl == 0)
+ return search_line_sse42 (s, end, out);
+
+#ifdef __SSE2__
+ impl = 1;
+#endif
+
+ if (impl == 1)
+ return search_line_sse2 (s, end, out);
+ else
+ return search_line_acc_char (s, end, out);
+}
+
+#else
+
+/* We only have one accellerated alternative. Use a direct call so that
+ we encourage inlining. We must still provide the init_vectorized_lexer
+ entry point, even though it does nothing. */
+
+#define search_line_fast search_line_acc_char
+
+void
+init_vectorized_lexer (void)
+{
+}
+
+#endif
+
/* Returns with a logical line that contains no escaped newlines or
trigraphs. This is a time-critical inner loop. */
void
@@ -109,12 +545,34 @@ _cpp_clean_line (cpp_reader *pfile)
buffer->cur_note = buffer->notes_used = 0;
buffer->cur = buffer->line_base = buffer->next_line;
buffer->need_line = false;
- s = buffer->next_line - 1;
+ s = buffer->next_line;
if (!buffer->from_stage3)
{
const uchar *pbackslash = NULL;
+ found_bs:
+ /* Perform an optimized search for \n, \r, \\, ?. */
+ if (search_line_fast (s, buffer->rlimit, &s))
+ {
+ c = *s;
+
+ /* Special case for backslash which is reasonably common.
+ Continue searching using the fast path. */
+ if (c == '\\')
+ {
+ pbackslash = s;
+ s++;
+ goto found_bs;
+ }
+ if (__builtin_expect (c == '?', false))
+ goto found_qm;
+ else
+ goto found_nl_cr;
+ }
+
+ s--;
+
/* Short circuit for the common case of an un-escaped line with
no trigraphs. The primary win here is by not writing any
data back to memory until we have to. */
@@ -124,6 +582,7 @@ _cpp_clean_line (cpp_reader *pfile)
if (__builtin_expect (c == '\n', false)
|| __builtin_expect (c == '\r', false))
{
+ found_nl_cr:
d = (uchar *) s;
if (__builtin_expect (s == buffer->rlimit, false))
@@ -157,26 +616,28 @@ _cpp_clean_line (cpp_reader *pfile)
}
if (__builtin_expect (c == '\\', false))
pbackslash = s;
- else if (__builtin_expect (c == '?', false)
- && __builtin_expect (s[1] == '?', false)
- && _cpp_trigraph_map[s[2]])
+ else if (__builtin_expect (c == '?', false))
{
- /* Have a trigraph. We may or may not have to convert
- it. Add a line note regardless, for -Wtrigraphs. */
- add_line_note (buffer, s, s[2]);
- if (CPP_OPTION (pfile, trigraphs))
+ found_qm:
+ if (__builtin_expect (s[1] == '?', false)
+ && _cpp_trigraph_map[s[2]])
{
- /* We do, and that means we have to switch to the
- slow path. */
- d = (uchar *) s;
- *d = _cpp_trigraph_map[s[2]];
- s += 2;
- break;
+ /* Have a trigraph. We may or may not have to convert
+ it. Add a line note regardless, for -Wtrigraphs. */
+ add_line_note (buffer, s, s[2]);
+ if (CPP_OPTION (pfile, trigraphs))
+ {
+ /* We do, and that means we have to switch to the
+ slow path. */
+ d = (uchar *) s;
+ *d = _cpp_trigraph_map[s[2]];
+ s += 2;
+ break;
+ }
}
}
}
-
for (;;)
{
c = *++s;
@@ -184,7 +645,7 @@ _cpp_clean_line (cpp_reader *pfile)
if (c == '\n' || c == '\r')
{
- /* Handle DOS line endings. */
+ /* Handle DOS line endings. */
if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
s++;
if (s == buffer->rlimit)
@@ -215,9 +676,8 @@ _cpp_clean_line (cpp_reader *pfile)
}
else
{
- do
+ while (*s != '\n' && *s != '\r')
s++;
- while (*s != '\n' && *s != '\r');
d = (uchar *) s;
/* Handle DOS line endings. */
@@ -29,6 +29,9 @@ along with GCC; see the file COPYING3. If not see
#ifdef HAVE_STDDEF_H
# include <stddef.h>
#endif
+#ifdef HAVE_STDINT_H
+# include <stdint.h>
+#endif
#include <stdio.h>