[v8] Vectorized _cpp_clean_line

Message ID	4C702671.2090204@redhat.com
State	New
Headers	show Return-Path: <gcc-patches-return-271129-incoming=patchwork.ozlabs.org@gcc.gnu.org> Message-ID: <4C702671.2090204@redhat.com> Date: Sat, 21 Aug 2010 12:18:09 -0700 From: Richard Henderson <rth@redhat.com> User-Agent: Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.7) Gecko/20100720 Fedora/3.1.1-1.fc13 Thunderbird/3.1.1 MIME-Version: 1.0 To: GCC Patches <gcc-patches@gcc.gnu.org> CC: Andi Kleen <ak@linux.intel.com>, "David S. Miller" <davem@davemloft.net> Subject: [PATCH, v8] Vectorized _cpp_clean_line Content-Type: multipart/mixed; boundary="------------090106040300020109030308" Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org

diff --git a/libcpp/config.in b/libcpp/config.in index 9969934..95606c1 100644 --- a/libcpp/config.in +++ b/libcpp/config.in @@ -1,5 +1,8 @@ /* config.in. Generated from configure.ac by autoheader. */ +/* Define if building universal (internal helper macro) */ +#undef AC_APPLE_UNIVERSAL_BUILD + /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP systems. This function is required for `alloca.c' support on those systems. */ @@ -209,6 +212,9 @@ /* Define if <sys/types.h> defines \`uchar'. */ #undef HAVE_UCHAR +/* Define to 1 if the system has the type `uintptr_t'. */ +#undef HAVE_UINTPTR_T + /* Define to 1 if you have the <unistd.h> header file. */ #undef HAVE_UNISTD_H @@ -266,6 +272,18 @@ /* Define to 1 if your <sys/time.h> declares `struct tm'. */ #undef TM_IN_SYS_TIME +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +# undef WORDS_BIGENDIAN +# endif +#endif + /* Define to empty if `const' does not conform to ANSI C. */ #undef const @@ -278,8 +296,15 @@ /* Define to `long int' if <sys/types.h> does not define. */ #undef off_t +/* Define to `int' if <sys/types.h> does not define. */ +#undef ptrdiff_t + /* Define to `unsigned int' if <sys/types.h> does not define. */ #undef size_t /* Define to `int' if <sys/types.h> does not define. */ #undef ssize_t + +/* Define to the type of an unsigned integer type wide enough to hold a + pointer, if such a type exists, and if the system does not define it. 
*/ +#undef uintptr_t diff --git a/libcpp/configure b/libcpp/configure index a4700e6..a2ce1c3 100755 --- a/libcpp/configure +++ b/libcpp/configure @@ -1846,6 +1846,48 @@ fi } # ac_fn_cxx_check_header_mongrel +# ac_fn_cxx_try_run LINENO +# ------------------------ +# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes +# that executables *can* be run. +ac_fn_cxx_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + ac_retval=0 +else + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;} + return $ac_retval + +} # ac_fn_cxx_try_run + # ac_fn_cxx_try_link LINENO # ------------------------- # Try to link conftest.$ac_ext, and return whether this succeeded. @@ -1946,48 +1988,6 @@ $as_echo "$ac_res" >&6; } } # ac_fn_cxx_check_type -# ac_fn_cxx_try_run LINENO -# ------------------------ -# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes -# that executables *can* be run. 
-ac_fn_cxx_try_run () -{ - as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - if { { ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' - { { case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" -$as_echo "$ac_try_echo"; } >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; }; }; then : - ac_retval=0 -else - $as_echo "$as_me: program exited with status $ac_status" >&5 - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_retval=$ac_status -fi - rm -rf conftest.dSYM conftest_ipa8_conftest.oo - eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;} - return $ac_retval - -} # ac_fn_cxx_try_run - # ac_fn_cxx_compute_int LINENO EXPR VAR INCLUDES # ---------------------------------------------- # Tries to find the compile-time value of EXPR in a program that includes @@ -5172,6 +5172,230 @@ done fi # Checks for typedefs, structures, and compiler characteristics. + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5 +$as_echo_n "checking whether byte ordering is bigendian... " >&6; } +if test "${ac_cv_c_bigendian+set}" = set; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_bigendian=unknown + # See if we're dealing with a universal compiler. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#ifndef __APPLE_CC__ + not a universal capable compiler + #endif + typedef int dummy; + +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + + # Check for potential -arch flags. It is not universal unless + # there are at least two -arch flags with different values. + ac_arch= + ac_prev= + for ac_word in $CC $CFLAGS $CPPFLAGS $LDFLAGS; do + if test -n "$ac_prev"; then + case $ac_word in + i?86 | x86_64 | ppc | ppc64) + if test -z "$ac_arch" || test "$ac_arch" = "$ac_word"; then + ac_arch=$ac_word + else + ac_cv_c_bigendian=universal + break + fi + ;; + esac + ac_prev= + elif test "x$ac_word" = "x-arch"; then + ac_prev=arch + fi + done +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + if test $ac_cv_c_bigendian = unknown; then + # See if sys/param.h defines the BYTE_ORDER macro. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <sys/types.h> + #include <sys/param.h> + +int +main () +{ +#if ! (defined BYTE_ORDER && defined BIG_ENDIAN \ + && defined LITTLE_ENDIAN && BYTE_ORDER && BIG_ENDIAN \ + && LITTLE_ENDIAN) + bogus endian macros + #endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + # It does; now see whether it defined to BIG_ENDIAN or not. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <sys/types.h> + #include <sys/param.h> + +int +main () +{ +#if BYTE_ORDER != BIG_ENDIAN + not big endian + #endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_c_bigendian=yes +else + ac_cv_c_bigendian=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + fi + if test $ac_cv_c_bigendian = unknown; then + # See if <limits.h> defines _LITTLE_ENDIAN or _BIG_ENDIAN (e.g., Solaris). + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <limits.h> + +int +main () +{ +#if ! 
(defined _LITTLE_ENDIAN || defined _BIG_ENDIAN) + bogus endian macros + #endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + # It does; now see whether it defined to _BIG_ENDIAN or not. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <limits.h> + +int +main () +{ +#ifndef _BIG_ENDIAN + not big endian + #endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_c_bigendian=yes +else + ac_cv_c_bigendian=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + fi + if test $ac_cv_c_bigendian = unknown; then + # Compile a test program. + if test "$cross_compiling" = yes; then : + # Try to guess by grepping values from an object file. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +short int ascii_mm[] = + { 0x4249, 0x4765, 0x6E44, 0x6961, 0x6E53, 0x7953, 0 }; + short int ascii_ii[] = + { 0x694C, 0x5454, 0x656C, 0x6E45, 0x6944, 0x6E61, 0 }; + int use_ascii (int i) { + return ascii_mm[i] + ascii_ii[i]; + } + short int ebcdic_ii[] = + { 0x89D3, 0xE3E3, 0x8593, 0x95C5, 0x89C4, 0x9581, 0 }; + short int ebcdic_mm[] = + { 0xC2C9, 0xC785, 0x95C4, 0x8981, 0x95E2, 0xA8E2, 0 }; + int use_ebcdic (int i) { + return ebcdic_mm[i] + ebcdic_ii[i]; + } + extern int foo; + +int +main () +{ +return use_ascii (foo) == use_ebcdic (foo); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + if grep BIGenDianSyS conftest.$ac_objext >/dev/null; then + ac_cv_c_bigendian=yes + fi + if grep LiTTleEnDian conftest.$ac_objext >/dev/null ; then + if test "$ac_cv_c_bigendian" = unknown; then + ac_cv_c_bigendian=no + else + # finding both strings is unlikely to happen, but who knows? + ac_cv_c_bigendian=unknown + fi + fi +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$ac_includes_default +int +main () +{ + + /* Are we little or big endian? From Harbison&Steele. */ + union + { + long int l; + char c[sizeof (long int)]; + } u; + u.l = 1; + return u.c[sizeof (long int) - 1] == 1; + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_run "$LINENO"; then : + ac_cv_c_bigendian=no +else + ac_cv_c_bigendian=yes +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_bigendian" >&5 +$as_echo "$ac_cv_c_bigendian" >&6; } + case $ac_cv_c_bigendian in #( + yes) + $as_echo "#define WORDS_BIGENDIAN 1" >>confdefs.h +;; #( + no) + ;; #( + universal) + +$as_echo "#define AC_APPLE_UNIVERSAL_BUILD 1" >>confdefs.h + + ;; #( + *) + as_fn_error "unknown endianness + presetting ac_cv_c_bigendian=no (or yes) will help" "$LINENO" 5 ;; + esac + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5 $as_echo_n "checking for an ANSI C-conforming const... " >&6; } if test "${ac_cv_c_const+set}" = set; then : @@ -5371,6 +5595,53 @@ _ACEOF fi + + ac_fn_cxx_check_type "$LINENO" "uintptr_t" "ac_cv_type_uintptr_t" "$ac_includes_default" +if test "x$ac_cv_type_uintptr_t" = x""yes; then : + +$as_echo "#define HAVE_UINTPTR_T 1" >>confdefs.h + +else + for ac_type in 'unsigned int' 'unsigned long int' \ + 'unsigned long long int'; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$ac_includes_default +int +main () +{ +static int test_array [1 - 2 * !(sizeof (void *) <= sizeof ($ac_type))]; +test_array [0] = 0 + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +cat >>confdefs.h <<_ACEOF +#define uintptr_t $ac_type +_ACEOF + + ac_type= +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test -z "$ac_type" && break + done +fi + + +ac_fn_cxx_check_type "$LINENO" "ptrdiff_t" "ac_cv_type_ptrdiff_t" "$ac_includes_default" +if test "x$ac_cv_type_ptrdiff_t" = x""yes; then : + +else + +cat >>confdefs.h <<_ACEOF +#define ptrdiff_t int +_ACEOF + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether struct tm is in sys/time.h or time.h" >&5 $as_echo_n "checking whether struct tm is in sys/time.h or time.h... " >&6; } if test "${ac_cv_struct_tm+set}" = set; then : @@ -7042,6 +7313,7 @@ LTLIBOBJS=$ac_ltlibobjs + : ${CONFIG_STATUS=./config.status} ac_write_fail=0 ac_clean_files_save=$ac_clean_files diff --git a/libcpp/configure.ac b/libcpp/configure.ac index ceea29c..1250f49 100644 --- a/libcpp/configure.ac +++ b/libcpp/configure.ac @@ -70,12 +70,15 @@ else fi # Checks for typedefs, structures, and compiler characteristics. +AC_C_BIGENDIAN AC_C_CONST AC_C_INLINE AC_FUNC_OBSTACK AC_TYPE_OFF_T AC_TYPE_SIZE_T -AC_CHECK_TYPE(ssize_t, int) +AC_TYPE_SSIZE_T +AC_TYPE_UINTPTR_T +AC_CHECK_TYPE(ptrdiff_t, int) AC_STRUCT_TM AC_CHECK_SIZEOF(int) AC_CHECK_SIZEOF(long) diff --git a/libcpp/lex.c b/libcpp/lex.c index f628272..bc0086d 100644 --- a/libcpp/lex.c +++ b/libcpp/lex.c @@ -1,5 +1,5 @@ /* CPP Library - lexical analysis. - Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009 + Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. Contributed by Per Bothner, 1994-95. 
Based on CCCP program by Paul Rubin, June 1986 @@ -96,6 +96,531 @@ add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type) buffer->notes_used++; } + +/* Fast path to find line special characters using optimized character + scanning algorithms. Anything complicated falls back to the slow + path below. Since this loop is very hot it's worth doing these kinds + of optimizations. + + One of the paths through the ifdefs should provide + + const uchar *search_line_fast (const uchar *s, const uchar *end); + + Between S and END, search for \n, \r, \\, ?. Return a pointer to + the found character. + + Note that the last character of the buffer is *always* a newline, + as forced by _cpp_convert_input. This fact can be used to avoid + explicitly looking for the end of the buffer. */ + +/* Configure gives us an ifdef test. */ +#ifndef WORDS_BIGENDIAN +#define WORDS_BIGENDIAN 0 +#endif + +/* We'd like the largest integer that fits into a register. There's nothing + in <stdint.h> that gives us that. For most hosts this is unsigned long, + but MS decided on an LLP64 model. Thankfully when building with GCC we + can get the "real" word size. */ +#ifdef __GNUC__ +typedef unsigned int word_type __attribute__((__mode__(__word__))); +#else +typedef unsigned long word_type; +#endif + +/* The code below is only expecting sizes 4 or 8. + Die at compile-time if this expectation is violated. */ +typedef char check_word_type_size + [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1]; + +/* Return X with the first N bytes forced to values that won't match one + of the interesting characters. Note that NUL is not interesting. */ + +static inline word_type +acc_char_mask_misalign (word_type val, unsigned int n) +{ + word_type mask = -1; + if (WORDS_BIGENDIAN) + mask >>= n * 8; + else + mask <<= n * 8; + return val & mask; +} + +/* Return X replicated to all byte positions within WORD_TYPE. 
*/ + +static inline word_type +acc_char_replicate (uchar x) +{ + word_type ret; + + ret = (x << 24) | (x << 16) | (x << 8) | x; + if (sizeof(word_type) == 8) + ret = (ret << 16 << 16) | ret; + return ret; +} + +/* Return non-zero if some byte of VAL is (probably) C. */ + +static inline word_type +acc_char_cmp (word_type val, word_type c) +{ +#if defined(__GNUC__) && defined(__alpha__) + /* We can get exact results using a compare-bytes instruction. + Get (val == c) via (0 >= (val ^ c)). */ + return __builtin_alpha_cmpbge (0, val ^ c); +#else + word_type magic = 0x7efefefeU; + if (sizeof(word_type) == 8) + magic = (magic << 16 << 16) | 0xfefefefeU; + magic |= 1; + + val ^= c; + return ((val + magic) ^ ~val) & ~magic; +#endif +} + +/* Given the result of acc_char_cmp is non-zero, return the index of + the found character. If this was a false positive, return -1. */ + +static inline int +acc_char_index (word_type cmp ATTRIBUTE_UNUSED, + word_type val ATTRIBUTE_UNUSED) +{ +#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN + /* The cmpbge instruction sets *bits* of the result corresponding to + matches in the bytes with no false positives. */ + return __builtin_ctzl (cmp); +#else + unsigned int i; + + /* ??? It would be nice to force unrolling here, + and have all of these constants folded. */ + for (i = 0; i < sizeof(word_type); ++i) + { + uchar c; + if (WORDS_BIGENDIAN) + c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff; + else + c = (val >> i * 8) & 0xff; + + if (c == '\n' || c == '\r' || c == '\\' || c == '?') + return i; + } + + return -1; +#endif +} + +/* A version of the fast scanner using bit fiddling techniques. + + For 32-bit words, one would normally perform 16 comparisons and + 16 branches. With this algorithm one performs 24 arithmetic + operations and one branch. Whether this is faster with a 32-bit + word size is going to be somewhat system dependent. 
+ + For 64-bit words, we eliminate twice the number of comparisons + and branches without increasing the number of arithmetic operations. + It's almost certainly going to be a win with 64-bit word size. */ + +static const uchar * search_line_acc_char (const uchar *, const uchar *) + ATTRIBUTE_UNUSED; + +static const uchar * +search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) +{ + const word_type repl_nl = acc_char_replicate ('\n'); + const word_type repl_cr = acc_char_replicate ('\r'); + const word_type repl_bs = acc_char_replicate ('\\'); + const word_type repl_qm = acc_char_replicate ('?'); + + unsigned int misalign; + const word_type *p; + word_type val, t; + + /* Align the buffer. Mask out any bytes from before the beginning. */ + p = (word_type *)((uintptr_t)s & -sizeof(word_type)); + val = *p; + misalign = (uintptr_t)s & (sizeof(word_type) - 1); + if (misalign) + val = acc_char_mask_misalign (val, misalign); + + /* Main loop. */ + while (1) + { + t = acc_char_cmp (val, repl_nl); + t |= acc_char_cmp (val, repl_cr); + t |= acc_char_cmp (val, repl_bs); + t |= acc_char_cmp (val, repl_qm); + + if (__builtin_expect (t != 0, 0)) + { + int i = acc_char_index (t, val); + if (i >= 0) + return (const uchar *)p + i; + } + + val = *++p; + } +} + +#if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) + +/* Replicated character data to be shared between implementations. + Recall that outside of a context with vector support we can't + define compatible vector types, therefore these are all defined + in terms of raw characters. 
*/ +static const char repl_chars[4][16] __attribute__((aligned(16))) = { + { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }, + { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', + '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' }, + { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', + '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' }, + { '?', '?', '?', '?', '?', '?', '?', '?', + '?', '?', '?', '?', '?', '?', '?', '?' }, +}; + +/* A version of the fast scanner using MMX vectorized byte compare insns. + + This uses the PMOVMSKB instruction which was introduced with "MMX2", + which was packaged into SSE1; it is also present in the AMD 3dNOW-A + extension. Mark the function as using "sse" so that we emit a real + "emms" instruction, rather than the 3dNOW "femms" instruction. */ + +static const uchar * +#ifndef __SSE__ +__attribute__((__target__("sse"))) +#endif +search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) +{ + typedef char v8qi __attribute__ ((__vector_size__ (8))); + typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); + + const v8qi repl_nl = *(const v8qi *)repl_chars[0]; + const v8qi repl_cr = *(const v8qi *)repl_chars[1]; + const v8qi repl_bs = *(const v8qi *)repl_chars[2]; + const v8qi repl_qm = *(const v8qi *)repl_chars[3]; + + unsigned int misalign, found, mask; + const v8qi *p; + v8qi data, t, c; + + /* Align the source pointer. While MMX doesn't generate unaligned data + faults, this allows us to safely scan to the end of the buffer without + reading beyond the end of the last page. */ + misalign = (uintptr_t)s & 7; + p = (const v8qi *)((uintptr_t)s & -8); + data = *p; + + /* Create a mask for the bytes that are valid within the first + 8-byte block. The Idea here is that the AND with the mask + within the loop is "free", since we need some AND or TEST + insn in order to set the flags for the branch anyway. 
*/ + mask = -1u << misalign; + + /* Main loop processing 8 bytes at a time. */ + goto start; + do + { + data = *++p; + mask = -1; + + start: + t = __builtin_ia32_pcmpeqb(data, repl_nl); + c = __builtin_ia32_pcmpeqb(data, repl_cr); + t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c); + c = __builtin_ia32_pcmpeqb(data, repl_bs); + t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c); + c = __builtin_ia32_pcmpeqb(data, repl_qm); + t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c); + found = __builtin_ia32_pmovmskb (t); + found &= mask; + } + while (!found); + + __builtin_ia32_emms (); + + /* FOUND contains 1 in bits for which we matched a relevant + character. Conversion to the byte index is trivial. */ + found = __builtin_ctz(found); + return (const uchar *)p + found; +} + +/* A version of the fast scanner using SSE2 vectorized byte compare insns. */ + +static const uchar * +#ifndef __SSE2__ +__attribute__((__target__("sse2"))) +#endif +search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) +{ + typedef char v16qi __attribute__ ((__vector_size__ (16))); + + const v16qi repl_nl = *(const v16qi *)repl_chars[0]; + const v16qi repl_cr = *(const v16qi *)repl_chars[1]; + const v16qi repl_bs = *(const v16qi *)repl_chars[2]; + const v16qi repl_qm = *(const v16qi *)repl_chars[3]; + + unsigned int misalign, found, mask; + const v16qi *p; + v16qi data, t; + + /* Align the source pointer. */ + misalign = (uintptr_t)s & 15; + p = (const v16qi *)((uintptr_t)s & -16); + data = *p; + + /* Create a mask for the bytes that are valid within the first + 16-byte block. The Idea here is that the AND with the mask + within the loop is "free", since we need some AND or TEST + insn in order to set the flags for the branch anyway. */ + mask = -1u << misalign; + + /* Main loop processing 16 bytes at a time. 
*/ + goto start; + do + { + data = *++p; + mask = -1; + + start: + t = __builtin_ia32_pcmpeqb128(data, repl_nl); + t |= __builtin_ia32_pcmpeqb128(data, repl_cr); + t |= __builtin_ia32_pcmpeqb128(data, repl_bs); + t |= __builtin_ia32_pcmpeqb128(data, repl_qm); + found = __builtin_ia32_pmovmskb128 (t); + found &= mask; + } + while (!found); + + /* FOUND contains 1 in bits for which we matched a relevant + character. Conversion to the byte index is trivial. */ + found = __builtin_ctz(found); + return (const uchar *)p + found; +} + +/* A version of the fast scanner using SSE 4.2 vectorized string insns. */ + +static const uchar * +#ifndef __SSE4_2__ +__attribute__((__target__("sse4.2"))) +#endif +search_line_sse42 (const uchar *s, const uchar *end) +{ + typedef char v16qi __attribute__ ((__vector_size__ (16))); + static const v16qi search = { '\n', '\r', '?', '\\' }; + + uintptr_t si = (uintptr_t)s; + uintptr_t index; + + /* Check for unaligned input. */ + if (si & 15) + { + if (__builtin_expect (end - s < 16, 0) + && __builtin_expect ((si & 0xfff) > 0xff0, 0)) + { + /* There are less than 16 bytes left in the buffer, and less + than 16 bytes left on the page. Reading 16 bytes at this + point might generate a spurious page fault. Defer to the + SSE2 implementation, which already handles alignment. */ + return search_line_sse2 (s, end); + } + + /* ??? The builtin doesn't understand that the PCMPESTRI read from + memory need not be aligned. */ + __asm ("%vpcmpestri $0, (%1), %2" + : "=c"(index) : "r"(s), "x"(search), "a"(4), "d"(16)); + if (__builtin_expect (index < 16, 0)) + goto found; + + /* Advance the pointer to an aligned address. We will re-scan a + few bytes, but we no longer need care for reading past the + end of a page, since we're guaranteed a match. */ + s = (const uchar *)((si + 16) & -16); + } + + /* Main loop, processing 16 bytes at a time. By doing the whole loop + in inline assembly, we can make proper use of the flags set. 
*/ + __asm ( "sub $16, %1\n" + " .balign 16\n" + "0: add $16, %1\n" + " %vpcmpestri $0, (%1), %2\n" + " jnc 0b" + : "=&c"(index), "+r"(s) + : "x"(search), "a"(4), "d"(16)); + + found: + return s + index; +} + +/* Check the CPU capabilities. */ + +#include "../gcc/config/i386/cpuid.h" + +typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *); +static search_line_fast_type search_line_fast; + +static void __attribute__((constructor)) +init_vectorized_lexer (void) +{ + unsigned dummy, ecx = 0, edx = 0; + search_line_fast_type impl = search_line_acc_char; + int minimum = 0; + +#if defined(__SSE4_2__) + minimum = 3; +#elif defined(__SSE2__) + minimum = 2; +#elif defined(__SSE__) || defined(__3dNOW_A__) + minimum = 1; +#endif + + if (minimum == 3) + impl = search_line_sse42; + else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) + { + if (minimum == 3 || (ecx & bit_SSE4_2)) + impl = search_line_sse42; + else if (minimum == 2 || (edx & bit_SSE2)) + impl = search_line_sse2; + else if (minimum == 1 || (edx & bit_SSE)) + impl = search_line_mmx; + } + else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx)) + { + if (minimum == 1 || edx & bit_3DNOWP) + impl = search_line_mmx; + } + + search_line_fast = impl; +} + +#elif defined(__GNUC__) && defined(__ALTIVEC__) + +/* A version of the fast scanner using AltiVec vectorized byte compares. */ +/* ??? Unfortunately, attribute(target("altivec")) is not yet supported, + so we can't compile this function without -maltivec on the command line + (or implied by some other switch). 
*/ + +static const uchar * +search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) +{ + typedef __attribute__((altivec(vector))) unsigned char vc; + + const vc repl_nl = { + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' + }; + const vc repl_cr = { + '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r', + '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' + }; + const vc repl_bs = { + '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\', + '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' + }; + const vc repl_qm = { + '?', '?', '?', '?', '?', '?', '?', '?', + '?', '?', '?', '?', '?', '?', '?', '?', + }; + const vc ones = { + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + }; + const vc zero = { 0 }; + + vc data, mask, t; + + /* Altivec loads automatically mask addresses with -16. This lets us + issue the first load as early as possible. */ + data = __builtin_vec_ld(0, (const vc *)s); + + /* Discard bytes before the beginning of the buffer. Do this by + beginning with all ones and shifting in zeros according to the + mis-alignment. The LVSR instruction pulls the exact shift we + want from the address. */ + mask = __builtin_vec_lvsr(0, s); + mask = __builtin_vec_perm(zero, ones, mask); + data &= mask; + + /* While altivec loads mask addresses, we still need to align S so + that the offset we compute at the end is correct. */ + s = (const uchar *)((uintptr_t)s & -16); + + /* Main loop processing 16 bytes at a time. */ + goto start; + do + { + vc m_nl, m_cr, m_bs, m_qm; + + s += 16; + data = __builtin_vec_ld(0, (const vc *)s); + + start: + m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl); + m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr); + m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs); + m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm); + t = (m_nl | m_cr) | (m_bs | m_qm); + + /* T now contains 0xff in bytes for which we matched one of the relevant + characters. 
We want to exit the loop if any byte in T is non-zero. + Below is the expansion of vec_any_ne(t, zero). */ + } + while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero)); + + { +#define N (sizeof(vc) / sizeof(long)) + + typedef char check_count[(N == 2 || N == 4) * 2 - 1]; + union { + vc v; + unsigned long l[N]; + } u; + unsigned long l, i = 0; + + u.v = t; + + /* Find the first word of T that is non-zero. */ + switch (N) + { + case 4: + l = u.l[i++]; + if (l != 0) + break; + s += sizeof(unsigned long); + l = u.l[i++]; + if (l != 0) + break; + s += sizeof(unsigned long); + case 2: + l = u.l[i++]; + if (l != 0) + break; + s += sizeof(unsigned long); + l = u.l[i]; + } + + /* L now contains 0xff in bytes for which we matched one of the + relevant characters. We can find the byte index by finding + its bit index and dividing by 8. */ + l = __builtin_clzl(l) >> 3; + return s + l; + +#undef N + } +} + +#else + +/* We only have one accelerated alternative. Use a direct call so that + we encourage inlining. */ + +#define search_line_fast search_line_acc_char + +#endif + /* Returns with a logical line that contains no escaped newlines or trigraphs. This is a time-critical inner loop. */ void @@ -109,82 +634,91 @@ _cpp_clean_line (cpp_reader *pfile) buffer->cur_note = buffer->notes_used = 0; buffer->cur = buffer->line_base = buffer->next_line; buffer->need_line = false; - s = buffer->next_line - 1; + s = buffer->next_line; if (!buffer->from_stage3) { const uchar *pbackslash = NULL; - /* Short circuit for the common case of an un-escaped line with + /* Fast path. This is the common case of an un-escaped line with no trigraphs. The primary win here is by not writing any data back to memory until we have to. */ - for (;;) + while (1) { - c = *++s; - if (__builtin_expect (c == '\n', false) - || __builtin_expect (c == '\r', false)) - { - d = (uchar *) s; + /* Perform an optimized search for \n, \r, \\, ?. 
*/ + s = search_line_fast (s, buffer->rlimit); - if (__builtin_expect (s == buffer->rlimit, false)) - goto done; - - /* DOS line ending? */ - if (__builtin_expect (c == '\r', false) - && s[1] == '\n') - { - s++; - if (s == buffer->rlimit) - goto done; - } - - if (__builtin_expect (pbackslash == NULL, true)) - goto done; - - /* Check for escaped newline. */ - p = d; - while (is_nvspace (p[-1])) - p--; - if (p - 1 != pbackslash) - goto done; - - /* Have an escaped newline; process it and proceed to - the slow path. */ - add_line_note (buffer, p - 1, p != d ? ' ' : '\\'); - d = p - 2; - buffer->next_line = p - 1; - break; + c = *s; + if (c == '\\') + { + /* Record the location of the backslash and continue. */ + pbackslash = s++; } - if (__builtin_expect (c == '\\', false)) - pbackslash = s; - else if (__builtin_expect (c == '?', false) - && __builtin_expect (s[1] == '?', false) - && _cpp_trigraph_map[s[2]]) + else if (__builtin_expect (c == '?', 0)) { - /* Have a trigraph. We may or may not have to convert - it. Add a line note regardless, for -Wtrigraphs. */ - add_line_note (buffer, s, s[2]); - if (CPP_OPTION (pfile, trigraphs)) + if (__builtin_expect (s[1] == '?', false) + && _cpp_trigraph_map[s[2]]) { - /* We do, and that means we have to switch to the - slow path. */ - d = (uchar *) s; - *d = _cpp_trigraph_map[s[2]]; - s += 2; - break; + /* Have a trigraph. We may or may not have to convert + it. Add a line note regardless, for -Wtrigraphs. */ + add_line_note (buffer, s, s[2]); + if (CPP_OPTION (pfile, trigraphs)) + { + /* We do, and that means we have to switch to the + slow path. */ + d = (uchar *) s; + *d = _cpp_trigraph_map[s[2]]; + s += 2; + goto slow_path; + } } + /* Not a trigraph. Continue on fast-path. */ + s++; } + else + break; } + /* This must be \r or \n. We're either done, or we'll be forced + to write back to the buffer and continue on the slow path. 
*/ + d = (uchar *) s; + + if (__builtin_expect (s == buffer->rlimit, false)) + goto done; + + /* DOS line ending? */ + if (__builtin_expect (c == '\r', false) && s[1] == '\n') + { + s++; + if (s == buffer->rlimit) + goto done; + } + + if (__builtin_expect (pbackslash == NULL, true)) + goto done; + + /* Check for escaped newline. */ + p = d; + while (is_nvspace (p[-1])) + p--; + if (p - 1 != pbackslash) + goto done; + + /* Have an escaped newline; process it and proceed to + the slow path. */ + add_line_note (buffer, p - 1, p != d ? ' ' : '\\'); + d = p - 2; + buffer->next_line = p - 1; - for (;;) + slow_path: + while (1) { c = *++s; *++d = c; if (c == '\n' || c == '\r') { - /* Handle DOS line endings. */ + /* Handle DOS line endings. */ if (c == '\r' && s != buffer->rlimit && s[1] == '\n') s++; if (s == buffer->rlimit) @@ -215,9 +749,8 @@ _cpp_clean_line (cpp_reader *pfile) } else { - do + while (*s != '\n' && *s != '\r') s++; - while (*s != '\n' && *s != '\r'); d = (uchar *) s; /* Handle DOS line endings. */ diff --git a/libcpp/system.h b/libcpp/system.h index 2472799..1a74734 100644 --- a/libcpp/system.h +++ b/libcpp/system.h @@ -29,6 +29,9 @@ along with GCC; see the file COPYING3. If not see #ifdef HAVE_STDDEF_H # include <stddef.h> #endif +#ifdef HAVE_STDINT_H +# include <stdint.h> +#endif #include <stdio.h>

[v8] Vectorized _cpp_clean_line

Commit Message

Comments

Patch