
[2/2] Add AVX2 code path to lexer

Message ID 20240730154159.3799008-2-ak@linux.intel.com
State New
Series [1/2] Remove MMX code path in lexer

Commit Message

Andi Kleen July 30, 2024, 3:41 p.m. UTC
From: Andi Kleen <ak@gcc.gnu.org>

AVX2 is widely available on x86 and allows the scanner line check to
be done 32 bytes at a time. The code is similar to the SSE2 code path,
just using AVX2 and 32 bytes at a time instead of SSE2's 16 bytes.

Also adjust the code to allow inlining when the compiler
is built for an AVX2 host, following what other architectures
do.

I see a ~0.6% compile-time improvement for compiling i386
insn-recog.i with -O0.

libcpp/ChangeLog:

	* config.in (HAVE_AVX2): Add.
	* configure: Regenerate.
	* configure.ac: Add HAVE_AVX2 check.
	* lex.cc (repl_chars): Extend to 32 bytes.
	(search_line_avx2): New function to scan line using AVX2.
	(init_vectorized_lexer): Check for AVX2 in CPUID.
---
 libcpp/config.in    |  3 ++
 libcpp/configure    | 17 +++++++++
 libcpp/configure.ac |  3 ++
 libcpp/lex.cc       | 91 +++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 110 insertions(+), 4 deletions(-)

Comments

Andrew Pinski July 30, 2024, 3:49 p.m. UTC | #1
On Tue, Jul 30, 2024 at 8:43 AM Andi Kleen <ak@linux.intel.com> wrote:
>
> From: Andi Kleen <ak@gcc.gnu.org>
>
> AVX2 is widely available on x86 and it allows to do the scanner line
> check with 32 bytes at a time. The code is similar to the SSE2 code
> path, just using AVX and 32 bytes at a time instead of SSE2 16 bytes.
>
> Also adjust the code to allow inlining when the compiler
> is built for an AVX2 host, following what other architectures
> do.
>
> I see about a ~0.6% compile time improvement for compiling i386
> insn-recog.i with -O0.
>
> libcpp/ChangeLog:
>
>         * config.in (HAVE_AVX2): Add.
>         * configure: Regenerate.
>         * configure.ac: Add HAVE_AVX2 check.
>         * lex.cc (repl_chars): Extend to 32 bytes.
>         (search_line_avx2): New function to scan line using AVX2.
>         (init_vectorized_lexer): Check for AVX2 in CPUID.
> ---
>  libcpp/config.in    |  3 ++
>  libcpp/configure    | 17 +++++++++
>  libcpp/configure.ac |  3 ++
>  libcpp/lex.cc       | 91 +++++++++++++++++++++++++++++++++++++++++++--
>  4 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/libcpp/config.in b/libcpp/config.in
> index 253ef03a3dea..8fad6bd4b4f5 100644
> --- a/libcpp/config.in
> +++ b/libcpp/config.in
> @@ -213,6 +213,9 @@
>  /* Define to 1 if you can assemble SSE4 insns. */
>  #undef HAVE_SSE4
>
> +/* Define to 1 if you can assemble AVX2 insns. */
> +#undef HAVE_AVX2
> +
>  /* Define to 1 if you have the <stddef.h> header file. */
>  #undef HAVE_STDDEF_H
>
> diff --git a/libcpp/configure b/libcpp/configure
> index 32d6aaa30699..6d9286ac9601 100755
> --- a/libcpp/configure
> +++ b/libcpp/configure
> @@ -9149,6 +9149,23 @@ if ac_fn_c_try_compile "$LINENO"; then :
>
>  $as_echo "#define HAVE_SSE4 1" >>confdefs.h
>
> +fi
> +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
> +    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
> +/* end confdefs.h.  */
> +
> +int
> +main ()
> +{
> +asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))
> +  ;
> +  return 0;
> +}
> +_ACEOF
> +if ac_fn_c_try_compile "$LINENO"; then :
> +
> +$as_echo "#define HAVE_AVX2 1" >>confdefs.h
> +
>  fi
>  rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
>  esac
> diff --git a/libcpp/configure.ac b/libcpp/configure.ac
> index b883fec776fe..c06609827924 100644
> --- a/libcpp/configure.ac
> +++ b/libcpp/configure.ac
> @@ -200,6 +200,9 @@ case $target in
>      AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))],
>        [AC_DEFINE([HAVE_SSE4], [1],
>                  [Define to 1 if you can assemble SSE4 insns.])])
> +    AC_TRY_COMPILE([], [asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))],
> +      [AC_DEFINE([HAVE_AVX2], [1],
> +                [Define to 1 if you can assemble AVX2 insns.])])
>  esac
>
>  # Enable --enable-host-shared.
> diff --git a/libcpp/lex.cc b/libcpp/lex.cc
> index 1591dcdf151a..72f3402aac99 100644
> --- a/libcpp/lex.cc
> +++ b/libcpp/lex.cc
> @@ -278,19 +278,31 @@ search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
>  /* Replicated character data to be shared between implementations.
>     Recall that outside of a context with vector support we can't
>     define compatible vector types, therefore these are all defined
> -   in terms of raw characters.  */
> -static const char repl_chars[4][16] __attribute__((aligned(16))) = {
> +   in terms of raw characters.
> +   gcc constant propagates this and usually turns it into a
> +   vector broadcast, so it actually disappears.  */
> +
> +static const char repl_chars[4][32] __attribute__((aligned(32))) = {
>    { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
> +    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
> +    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
>      '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
>    { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
> +    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
> +    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
>      '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
>    { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
> +    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
> +    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
>      '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
>    { '?', '?', '?', '?', '?', '?', '?', '?',
> +    '?', '?', '?', '?', '?', '?', '?', '?',
> +    '?', '?', '?', '?', '?', '?', '?', '?',
>      '?', '?', '?', '?', '?', '?', '?', '?' },
>  };
>
>
> +#ifndef __AVX2__
>  /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
>
>  static const uchar *
> @@ -343,8 +355,9 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
>    found = __builtin_ctz(found);
>    return (const uchar *)p + found;
>  }
> +#endif
>
> -#ifdef HAVE_SSE4
> +#if defined(HAVE_SSE4) && !defined(__AVX2__)
>  /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
>
>  static const uchar *
> @@ -425,6 +438,71 @@ search_line_sse42 (const uchar *s, const uchar *end)
>  #define search_line_sse42 search_line_sse2
>  #endif
>
> +#ifdef HAVE_AVX2
> +
> +/* A version of the fast scanner using AVX2 vectorized byte compare insns.  */
> +
> +static const uchar *
> +#ifndef __AVX2__
> +__attribute__((__target__("avx2")))
> +#endif
> +search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
> +{
> +  typedef char v32qi __attribute__ ((__vector_size__ (32)));
> +
> +  const v32qi repl_nl = *(const v32qi *)repl_chars[0];
> +  const v32qi repl_cr = *(const v32qi *)repl_chars[1];
> +  const v32qi repl_bs = *(const v32qi *)repl_chars[2];
> +  const v32qi repl_qm = *(const v32qi *)repl_chars[3];
> +
> +  unsigned int misalign, found, mask;
> +  const v32qi *p;
> +  v32qi data, t;
> +
> +  /* Align the source pointer.  */
> +  misalign = (uintptr_t)s & 31;
> +  p = (const v32qi *)((uintptr_t)s & -32);
> +  data = *p;
> +
> +  /* Create a mask for the bytes that are valid within the first
> +     32-byte block.  The Idea here is that the AND with the mask
> +     within the loop is "free", since we need some AND or TEST
> +     insn in order to set the flags for the branch anyway.  */
> +  mask = -1u << misalign;
> +
> +  /* Main loop processing 32 bytes at a time.  */
> +  goto start;
> +  do
> +    {
> +      data = *++p;
> +      mask = -1;
> +
> +    start:
> +      t  = data == repl_nl;
> +      t |= data == repl_cr;
> +      t |= data == repl_bs;
> +      t |= data == repl_qm;
> +      found = __builtin_ia32_pmovmskb256 (t);

Using the builtin here seems wrong. Why not use the intrinsic
_mm256_movemask_epi8 ?
Oh I noticed that there is other similar builtin uses for the x86 case.
Also it might make sense to remove the MMX version.

Thanks,
Andrew Pinski
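
For reference, a minimal sketch of the inner test spelled with the
intrinsics Andrew mentions (a hypothetical rewrite, not the patch as
posted; match_mask_avx2 is an illustrative name, and <immintrin.h>
plus GCC's target attribute are assumed):

  #include <immintrin.h>

  /* Same four compares as the patch, but using _mm256_cmpeq_epi8,
     _mm256_or_si256 and _mm256_movemask_epi8 instead of GNU vector
     operators and __builtin_ia32_pmovmskb256.  */
  static unsigned int
  __attribute__ ((target ("avx2")))
  match_mask_avx2 (__m256i data, __m256i repl_nl, __m256i repl_cr,
                   __m256i repl_bs, __m256i repl_qm)
  {
    __m256i t = _mm256_cmpeq_epi8 (data, repl_nl);
    t = _mm256_or_si256 (t, _mm256_cmpeq_epi8 (data, repl_cr));
    t = _mm256_or_si256 (t, _mm256_cmpeq_epi8 (data, repl_bs));
    t = _mm256_or_si256 (t, _mm256_cmpeq_epi8 (data, repl_qm));
    return (unsigned int) _mm256_movemask_epi8 (t);
  }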



> +      found &= mask;
> +    }
> +  while (!found);
> +
> +  /* FOUND contains 1 in bits for which we matched a relevant
> +     character.  Conversion to the byte index is trivial.  */
> +  found = __builtin_ctz (found);
> +  return (const uchar *)p + found;
> +}
> +
> +#else
> +#define search_line_avx2 search_line_sse2
> +#endif
> +
> +#ifdef __AVX2__
> +/* Avoid indirect calls to encourage inlining if the compiler is built
> +   using AVX.  */
> +#define search_line_fast search_line_avx2
> +#else
> +
>  /* Check the CPU capabilities.  */
>
>  #include "../gcc/config/i386/cpuid.h"
> @@ -436,7 +514,7 @@ static search_line_fast_type search_line_fast;
>  static inline void
>  init_vectorized_lexer (void)
>  {
> -  unsigned dummy, ecx = 0, edx = 0;
> +  unsigned dummy, ecx = 0, edx = 0, ebx = 0;
>    search_line_fast_type impl = search_line_acc_char;
>    int minimum = 0;
>
> @@ -448,6 +526,10 @@ init_vectorized_lexer (void)
>
>    if (minimum == 3)
>      impl = search_line_sse42;
> +  else if (__get_cpuid_max (0, &dummy) >= 7
> +              && __get_cpuid_count (7, 0, &dummy, &ebx, &dummy, &dummy)
> +              && (ebx & bit_AVX2))
> +    impl = search_line_avx2;
>    else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
>      {
>        if (minimum == 3 || (ecx & bit_SSE4_2))
> @@ -458,6 +540,7 @@ init_vectorized_lexer (void)
>
>    search_line_fast = impl;
>  }
> +#endif /* !__AVX2__ */
>
>  #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
>
> --
> 2.45.2
>
Andi Kleen July 30, 2024, 3:56 p.m. UTC | #2
Andrew Pinski <pinskia@gmail.com> writes:
>
> Using the builtin here seems wrong. Why not use the intrinsic
> _mm256_movemask_epi8 ?

I followed the rest of the vectorized code paths. The original reason was that
there was some incompatibility of the intrinsic header with the source
build. I don't know if it's still true, but I guess it doesn't hurt.

> Also it might make sense to remove the MMX version.

See the previous patch.

-Andi
Alexander Monakov July 30, 2024, 4:32 p.m. UTC | #3
Hi,

On Tue, 30 Jul 2024, Andi Kleen wrote:

> AVX2 is widely available on x86 and it allows to do the scanner line
> check with 32 bytes at a time. The code is similar to the SSE2 code
> path, just using AVX and 32 bytes at a time instead of SSE2 16 bytes.
> 
> Also adjust the code to allow inlining when the compiler
> is built for an AVX2 host, following what other architectures
> do.
> 
> I see about a ~0.6% compile time improvement for compiling i386
> insn-recog.i with -O0.

Is that from some kind of rigorous measurement under perf? As you
surely know, 0.6% wall-clock time can be from boost clock variation
or just run-to-run noise on x86.

I have looked at this code before. When AVX2 is available, so is SSSE3,
and then a much more efficient approach is available: instead of comparing
against \r \n \\ ? one-by-one, build a vector

  0  1  2  3  4  5  6  7  8  9    a   b    c     d   e   f
{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }

where each character C we're seeking is at position (C % 16). Then
you can match against them all at once using PSHUFB:

  t = _mm_shuffle_epi8 (lut, data);
  t = t == data;

As you might recognize this handily beats the fancy SSE4.1 loop as well.
I did not pursue this because I did not measure a substantial improvement
(we're way into the land of diminishing returns here) and it seemed like
maintainers might not like to be distracted with that, but if we are
touching this code, might as well use the more efficient algorithm.
I'll be happy to propose a patch if people think it's worthwhile.
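
For concreteness, a minimal sketch of such a PSHUFB-based scanner
following the description above (illustrative only, not a posted
patch; search_line_ssse3 is a hypothetical name, and <tmmintrin.h>
plus GCC's target attribute are assumed):

  #include <stdint.h>
  #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

  static const unsigned char *
  __attribute__ ((target ("ssse3")))
  search_line_ssse3 (const unsigned char *s)
  {
    /* Each character we look for sits at LUT index (C & 15);
       index 0 holds 1 so that a zero byte can never match.  */
    const __m128i lut = _mm_setr_epi8 (1, 0, 0, 0, 0, 0, 0, 0,
                                       0, 0, '\n', 0, '\\', '\r', 0, '?');
    unsigned int misalign, found, mask;
    const __m128i *p;
    __m128i data, t;

    /* Align the source pointer and mask off bytes before S in the
       first block, as the existing SSE2 scanner does.  */
    misalign = (uintptr_t)s & 15;
    p = (const __m128i *)((uintptr_t)s & -16);
    data = *p;
    mask = -1u << misalign;

    goto start;
    do
      {
        data = *++p;
        mask = -1u;
      start:
        /* lut[data[i] & 15] == data[i] iff data[i] is \n, \r, \\ or ?.  */
        t = _mm_cmpeq_epi8 (_mm_shuffle_epi8 (lut, data), data);
        found = _mm_movemask_epi8 (t) & mask;
      }
    while (!found);

    return (const unsigned char *)p + __builtin_ctz (found);
  }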

I see one issue with your patch, please see below:

> @@ -448,6 +526,10 @@ init_vectorized_lexer (void)
>  
>    if (minimum == 3)
>      impl = search_line_sse42;
> +  else if (__get_cpuid_max (0, &dummy) >= 7
> +	       && __get_cpuid_count (7, 0, &dummy, &ebx, &dummy, &dummy)
> +	       && (ebx & bit_AVX2))
> +    impl = search_line_avx2;
>    else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
>      {
>        if (minimum == 3 || (ecx & bit_SSE4_2))

Surely this is not enough? You're not checking OS support via xgetbv.

Alexander
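
For reference, a minimal sketch of the OS-level check Alexander refers
to (hypothetical, not part of the posted patch; os_supports_ymm is an
illustrative name, and GCC's <cpuid.h> is assumed for __get_cpuid and
bit_OSXSAVE):

  #include <cpuid.h>

  /* AVX2 may only be dispatched to when the OS saves and restores YMM
     state; the CPUID feature bit alone does not guarantee that.  */
  static int
  os_supports_ymm (void)
  {
    unsigned int eax, ebx, ecx = 0, edx;
    if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx) || !(ecx & bit_OSXSAVE))
      return 0;
    unsigned int lo, hi;
    /* XGETBV with ECX=0 reads XCR0: bit 1 = XMM state, bit 2 = YMM state.  */
    __asm__ ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
    return (lo & 6) == 6;
  }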
Jakub Jelinek July 30, 2024, 4:45 p.m. UTC | #4
On Tue, Jul 30, 2024 at 08:41:59AM -0700, Andi Kleen wrote:
> From: Andi Kleen <ak@gcc.gnu.org>
> 
> AVX2 is widely available on x86 and it allows to do the scanner line
> check with 32 bytes at a time. The code is similar to the SSE2 code
> path, just using AVX and 32 bytes at a time instead of SSE2 16 bytes.
> 
> Also adjust the code to allow inlining when the compiler
> is built for an AVX2 host, following what other architectures
> do.
> 
> I see about a ~0.6% compile time improvement for compiling i386
> insn-recog.i with -O0.
> 
> libcpp/ChangeLog:
> 
> 	* config.in (HAVE_AVX2): Add.
> 	* configure: Regenerate.
> 	* configure.ac: Add HAVE_AVX2 check.
> 	* lex.cc (repl_chars): Extend to 32 bytes.
> 	(search_line_avx2): New function to scan line using AVX2.
> 	(init_vectorized_lexer): Check for AVX2 in CPUID.

I'd like to just mention that there in libcpp/files.cc (read_file_guts)
we have
  /* The + 16 here is space for the final '\n' and 15 bytes of padding,
     used to quiet warnings from valgrind or Address Sanitizer, when the
     optimized lexer accesses aligned 16-byte memory chunks, including
     the bytes after the malloced, area, and stops lexing on '\n'.  */
  buf = XNEWVEC (uchar, size + 16);
So, if for AVX2 we handle 32 bytes at a time rather than 16 this would
need to change (at least conditionally for arches where the AVX2 code could
be used).

	Jakub
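
For illustration, a sketch of the kind of adjustment Jakub describes
(hypothetical; alloc_file_buffer and LEXER_PAD are made-up names, not
the actual read_file_guts code):

  #include <stdlib.h>

  /* Reserve enough tail bytes that the widest aligned vector load the
     lexer may issue (16 or 32 bytes) cannot run past the allocation.  */
  #ifdef HAVE_AVX2
  # define LEXER_PAD 32   /* final '\n' plus 31 bytes of padding */
  #else
  # define LEXER_PAD 16   /* final '\n' plus 15 bytes of padding */
  #endif

  static unsigned char *
  alloc_file_buffer (size_t size)
  {
    return (unsigned char *) malloc (size + LEXER_PAD);
  }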
Andi Kleen July 30, 2024, 5:01 p.m. UTC | #5
> Is that from some kind of rigorous measurement under perf? As you
> surely know, 0.6% wall-clock time can be from boost clock variation
> or just run-to-run noise on x86.

I compared it using hyperfine, which does rigorous measurements, yes.
It was well above the run-to-run variability.

I had some other patches that didn't meet that bar, e.g. I've been
experimenting with more modern hashes for inchash and multiple ggc
free lists, but so far no above-noise results.

> 
> I have looked at this code before. When AVX2 is available, so is SSSE3,
> and then a much more efficient approach is available: instead of comparing
> against \r \n \\ ? one-by-one, build a vector
> 
>   0  1  2  3  4  5  6  7  8  9    a   b    c     d   e   f
> { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }
> 
> where each character C we're seeking is at position (C % 16). Then
> you can match against them all at once using PSHUFB:
> 
>   t = _mm_shuffle_epi8 (lut, data);
>   t = t == data;

I thought the PSHUFB trick only worked for some bit patterns?

At least according to this paper: https://arxiv.org/pdf/1902.08318

But yes if it applies here it's a good idea.


> 
> As you might recognize this handily beats the fancy SSE4.1 loop as well.
> I did not pursue this because I did not measure a substantial improvement
> (we're way into the land of diminishing returns here) and it seemed like
> maintainers might not like to be distracted with that, but if we are
> touching this code, might as well use the more efficient algorithm.
> I'll be happy to propose a patch if people think it's worthwhile.

Yes makes sense.

(of course it would be even better to teach the vectorizer about it,
although this will require fixing some other issues first, see PR116126)

-Andi
Alexander Monakov July 30, 2024, 5:22 p.m. UTC | #6
On Tue, 30 Jul 2024, Andi Kleen wrote:
> > I have looked at this code before. When AVX2 is available, so is SSSE3,
> > and then a much more efficient approach is available: instead of comparing
> > against \r \n \\ ? one-by-one, build a vector
> > 
> >   0  1  2  3  4  5  6  7  8  9    a   b    c     d   e   f
> > { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }
> > 
> > where each character C we're seeking is at position (C % 16). Then
> > you can match against them all at once using PSHUFB:
> > 
> >   t = _mm_shuffle_epi8 (lut, data);
> >   t = t == data;
> 
> I thought the PSHUFB trick only worked for some bit patterns?
> 
> At least according to this paper: https://arxiv.org/pdf/1902.08318
> 
> But yes if it applies here it's a good idea.

I wouldn't mention it if it did not apply.

> > As you might recognize this handily beats the fancy SSE4.1 loop as well.
> > I did not pursue this because I did not measure a substantial improvement
> > (we're way into the land of diminishing returns here) and it seemed like
> > maintainers might not like to be distracted with that, but if we are
> > touching this code, might as well use the more efficient algorithm.
> > I'll be happy to propose a patch if people think it's worthwhile.
> 
> Yes makes sense.

Okay, so what are the next steps here? Can someone who could eventually
supply a review indicate their buy-in for switching our SSE4.1 routine
for the SSSE3 PSHUFB-based one? And then for the 256-bit variant, assuming
it still brings an improvement over the faster PSHUFB scanner?

> (of course it would be even better to teach the vectorizer about it,
> although this will require fixing some other issues first, see PR116126)

(I disagree, FWIW)

(and you trimmed the part about XGETBV)

Alexander
Richard Biener July 30, 2024, 5:47 p.m. UTC | #7
> On 30.07.2024 at 19:22, Alexander Monakov <amonakov@ispras.ru> wrote:
> 
> 
> On Tue, 30 Jul 2024, Andi Kleen wrote:
>>> I have looked at this code before. When AVX2 is available, so is SSSE3,
>>> and then a much more efficient approach is available: instead of comparing
>>> against \r \n \\ ? one-by-one, build a vector
>>> 
>>>  0  1  2  3  4  5  6  7  8  9    a   b    c     d   e   f
>>> { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }
>>> 
>>> where each character C we're seeking is at position (C % 16). Then
>>> you can match against them all at once using PSHUFB:
>>> 
>>>  t = _mm_shuffle_epi8 (lut, data);
>>>  t = t == data;
>> 
>> I thought the PSHUFB trick only worked for some bit patterns?
>> 
>> At least according to this paper: https://arxiv.org/pdf/1902.08318
>> 
>> But yes if it applies here it's a good idea.
> 
> I wouldn't mention it if it did not apply.
> 
>>> As you might recognize this handily beats the fancy SSE4.1 loop as well.
>>> I did not pursue this because I did not measure a substantial improvement
>>> (we're way into the land of diminishing returns here) and it seemed like
>>> maintainers might not like to be distracted with that, but if we are
>>> touching this code, might as well use the more efficient algorithm.
>>> I'll be happy to propose a patch if people think it's worthwhile.
>> 
>> Yes makes sense.
> 
> Okay, so what are the next steps here? Can someone who could eventually
> supply a review indicate their buy-in for switching our SSE4.1 routine
> for the SSSE3 PSHUFB-based one? And then for the 256-bit variant, assuming
> it still brings an improvement over the faster PSHUFB scanner?

I’ll happily approve such a change.

>> (of course it would be even better to teach the vectorizer about it,
>> although this will require fixing some other issues first, see PR116126)
> 
> (I disagree, FWIW)

I also think writing optimized code with intrinsics is fine.

Richard 

> (and you trimmed the part about XGETBV)
> 
> Alexander
Kyrylo Tkachov July 30, 2024, 6:54 p.m. UTC | #8
> On 30 Jul 2024, at 19:01, Andi Kleen <andi@firstfloor.org> wrote:
> 
> 
>> Is that from some kind of rigorous measurement under perf? As you
>> surely know, 0.6% wall-clock time can be from boost clock variation
>> or just run-to-run noise on x86.
> 
> I compared it using hyperfine which does rigorous measurements yes.
> It was well above the run-to-run variability.

FWIW when I was experimenting with these paths I found that an -fsyntax-only compilation helps make the changes more pronounced.

Thanks,
Kyrill

> 
> I had some other patches that didn't meet that bar, e.g.
> i've been experimenting with more modern hashes for inchash
> and multiple ggc free lists, but so far no above noise
> results.
> 
>> 
>> I have looked at this code before. When AVX2 is available, so is SSSE3,
>> and then a much more efficient approach is available: instead of comparing
>> against \r \n \\ ? one-by-one, build a vector
>> 
>>  0  1  2  3  4  5  6  7  8  9    a   b    c     d   e   f
>> { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }
>> 
>> where each character C we're seeking is at position (C % 16). Then
>> you can match against them all at once using PSHUFB:
>> 
>>  t = _mm_shuffle_epi8 (lut, data);
>>  t = t == data;
> 
> I thought the PSHUFB trick only worked for some bit patterns?
> 
> At least according to this paper: https://arxiv.org/pdf/1902.08318
> 
> But yes if it applies here it's a good idea.
> 
> 
>> 
>> As you might recognize this handily beats the fancy SSE4.1 loop as well.
>> I did not pursue this because I did not measure a substantial improvement
>> (we're way into the land of diminishing returns here) and it seemed like
>> maintainers might not like to be distracted with that, but if we are
>> touching this code, might as well use the more efficient algorithm.
>> I'll be happy to propose a patch if people think it's worthwhile.
> 
> Yes makes sense.
> 
> (of course it would be even better to teach the vectorizer about it,
> although this will require fixing some other issues first, see PR116126)
> 
> -Andi

Patch

diff --git a/libcpp/config.in b/libcpp/config.in
index 253ef03a3dea..8fad6bd4b4f5 100644
--- a/libcpp/config.in
+++ b/libcpp/config.in
@@ -213,6 +213,9 @@ 
 /* Define to 1 if you can assemble SSE4 insns. */
 #undef HAVE_SSE4
 
+/* Define to 1 if you can assemble AVX2 insns. */
+#undef HAVE_AVX2
+
 /* Define to 1 if you have the <stddef.h> header file. */
 #undef HAVE_STDDEF_H
 
diff --git a/libcpp/configure b/libcpp/configure
index 32d6aaa30699..6d9286ac9601 100755
--- a/libcpp/configure
+++ b/libcpp/configure
@@ -9149,6 +9149,23 @@  if ac_fn_c_try_compile "$LINENO"; then :
 
 $as_echo "#define HAVE_SSE4 1" >>confdefs.h
 
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+$as_echo "#define HAVE_AVX2 1" >>confdefs.h
+
 fi
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 esac
diff --git a/libcpp/configure.ac b/libcpp/configure.ac
index b883fec776fe..c06609827924 100644
--- a/libcpp/configure.ac
+++ b/libcpp/configure.ac
@@ -200,6 +200,9 @@  case $target in
     AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))],
       [AC_DEFINE([HAVE_SSE4], [1],
 		 [Define to 1 if you can assemble SSE4 insns.])])
+    AC_TRY_COMPILE([], [asm ("vpcmpeqb %%ymm0, %%ymm4, %%ymm5" : : "i"(0))],
+      [AC_DEFINE([HAVE_AVX2], [1],
+		 [Define to 1 if you can assemble AVX2 insns.])])
 esac
 
 # Enable --enable-host-shared.
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 1591dcdf151a..72f3402aac99 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -278,19 +278,31 @@  search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
 /* Replicated character data to be shared between implementations.
    Recall that outside of a context with vector support we can't
    define compatible vector types, therefore these are all defined
-   in terms of raw characters.  */
-static const char repl_chars[4][16] __attribute__((aligned(16))) = {
+   in terms of raw characters.
+   gcc constant propagates this and usually turns it into a
+   vector broadcast, so it actually disappears.  */
+
+static const char repl_chars[4][32] __attribute__((aligned(32))) = {
   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
+    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
+    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
+    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
   { '?', '?', '?', '?', '?', '?', '?', '?',
+    '?', '?', '?', '?', '?', '?', '?', '?',
+    '?', '?', '?', '?', '?', '?', '?', '?',
     '?', '?', '?', '?', '?', '?', '?', '?' },
 };
 
 
+#ifndef __AVX2__
 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 
 static const uchar *
@@ -343,8 +355,9 @@  search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
   found = __builtin_ctz(found);
   return (const uchar *)p + found;
 }
+#endif
 
-#ifdef HAVE_SSE4
+#if defined(HAVE_SSE4) && !defined(__AVX2__)
 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
 
 static const uchar *
@@ -425,6 +438,71 @@  search_line_sse42 (const uchar *s, const uchar *end)
 #define search_line_sse42 search_line_sse2
 #endif
 
+#ifdef HAVE_AVX2
+
+/* A version of the fast scanner using AVX2 vectorized byte compare insns.  */
+
+static const uchar *
+#ifndef __AVX2__
+__attribute__((__target__("avx2")))
+#endif
+search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  typedef char v32qi __attribute__ ((__vector_size__ (32)));
+
+  const v32qi repl_nl = *(const v32qi *)repl_chars[0];
+  const v32qi repl_cr = *(const v32qi *)repl_chars[1];
+  const v32qi repl_bs = *(const v32qi *)repl_chars[2];
+  const v32qi repl_qm = *(const v32qi *)repl_chars[3];
+
+  unsigned int misalign, found, mask;
+  const v32qi *p;
+  v32qi data, t;
+
+  /* Align the source pointer.  */
+  misalign = (uintptr_t)s & 31;
+  p = (const v32qi *)((uintptr_t)s & -32);
+  data = *p;
+
+  /* Create a mask for the bytes that are valid within the first
+     32-byte block.  The Idea here is that the AND with the mask
+     within the loop is "free", since we need some AND or TEST
+     insn in order to set the flags for the branch anyway.  */
+  mask = -1u << misalign;
+
+  /* Main loop processing 32 bytes at a time.  */
+  goto start;
+  do
+    {
+      data = *++p;
+      mask = -1;
+
+    start:
+      t  = data == repl_nl;
+      t |= data == repl_cr;
+      t |= data == repl_bs;
+      t |= data == repl_qm;
+      found = __builtin_ia32_pmovmskb256 (t);
+      found &= mask;
+    }
+  while (!found);
+
+  /* FOUND contains 1 in bits for which we matched a relevant
+     character.  Conversion to the byte index is trivial.  */
+  found = __builtin_ctz (found);
+  return (const uchar *)p + found;
+}
+
+#else
+#define search_line_avx2 search_line_sse2
+#endif
+
+#ifdef __AVX2__
+/* Avoid indirect calls to encourage inlining if the compiler is built
+   using AVX.  */
+#define search_line_fast search_line_avx2
+#else
+
 /* Check the CPU capabilities.  */
 
 #include "../gcc/config/i386/cpuid.h"
@@ -436,7 +514,7 @@  static search_line_fast_type search_line_fast;
 static inline void
 init_vectorized_lexer (void)
 {
-  unsigned dummy, ecx = 0, edx = 0;
+  unsigned dummy, ecx = 0, edx = 0, ebx = 0;
   search_line_fast_type impl = search_line_acc_char;
   int minimum = 0;
 
@@ -448,6 +526,10 @@  init_vectorized_lexer (void)
 
   if (minimum == 3)
     impl = search_line_sse42;
+  else if (__get_cpuid_max (0, &dummy) >= 7
+	       && __get_cpuid_count (7, 0, &dummy, &ebx, &dummy, &dummy)
+	       && (ebx & bit_AVX2))
+    impl = search_line_avx2;
   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
     {
       if (minimum == 3 || (ecx & bit_SSE4_2))
@@ -458,6 +540,7 @@  init_vectorized_lexer (void)
 
   search_line_fast = impl;
 }
+#endif /* !__AVX2__ */
 
 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)