
[1/2] Remove MMX code path in lexer

Message ID 20240730154159.3799008-1-ak@linux.intel.com
State New
Series [1/2] Remove MMX code path in lexer

Commit Message

Andi Kleen July 30, 2024, 3:41 p.m. UTC
From: Andi Kleen <ak@gcc.gnu.org>

Host systems with only MMX and no SSE2 should be really rare by now.
Let's remove the MMX code path to keep the number of custom
implementations the same.

The SSE2 code path is also somewhat dubious (nearly every host should
have SSE 4.2, which is more than 15 years old), but it is kept because
it serves as the fallback for configurations without SSE 4.2, and
apparently Solaris still uses it due to toolchain deficiencies.
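
For reference, the surviving SSE2 scanner is built on the same
compare-and-movemask pattern as the removed MMX version, just with
16-byte vectors: compare a block against each character of interest
('\n', '\r', '\\', '?'), OR the results, turn them into a bit mask with
PMOVMSKB, and mask off the bytes before the start pointer so the first
block can be read with an aligned load.  The following is only a
minimal stand-alone sketch of that pattern, not libcpp's actual
search_line_sse2; the function name is made up, and like the real
scanners (the end parameter is ATTRIBUTE_UNUSED in the removed
function below as well) it relies on the buffer being terminated by
one of the searched characters.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>

static const unsigned char *
scan_for_specials (const unsigned char *s)
{
  const __m128i repl_nl = _mm_set1_epi8 ('\n');
  const __m128i repl_cr = _mm_set1_epi8 ('\r');
  const __m128i repl_bs = _mm_set1_epi8 ('\\');
  const __m128i repl_qm = _mm_set1_epi8 ('?');

  /* Align down to 16 bytes.  An aligned load never crosses a page
     boundary, so scanning to the end of the buffer cannot fault;
     the mask throws away matches in the bytes before S.  */
  unsigned int misalign = (uintptr_t) s & 15;
  const __m128i *p = (const __m128i *) ((uintptr_t) s & -16);
  unsigned int mask = -1u << misalign;

  for (;;)
    {
      __m128i data = _mm_load_si128 (p);
      __m128i t = _mm_cmpeq_epi8 (data, repl_nl);
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_cr));
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_bs));
      t = _mm_or_si128 (t, _mm_cmpeq_epi8 (data, repl_qm));
      unsigned int found = _mm_movemask_epi8 (t) & mask;
      if (found)
        /* Bit N of FOUND is set iff byte N of the block matched.  */
        return (const unsigned char *) p + __builtin_ctz (found);
      ++p;
      mask = -1u;   /* all 16 bytes are valid from the second block on */
    }
}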

libcpp/ChangeLog:

	* lex.cc (search_line_mmx): Remove function.
	(init_vectorized_lexer): Remove search_line_mmx.
---
 libcpp/lex.cc | 75 ---------------------------------------------------
 1 file changed, 75 deletions(-)

Comments

Richard Biener July 31, 2024, 7:38 a.m. UTC | #1
On Tue, Jul 30, 2024 at 5:43 PM Andi Kleen <ak@linux.intel.com> wrote:
>
> From: Andi Kleen <ak@gcc.gnu.org>
>
> Host systems with only MMX and no SSE2 should be really rare by now.
> Let's remove the MMX code path to keep the number of custom
> implementations the same.
>
> The SSE2 code path is also somewhat dubious (nearly every host should
> have SSE 4.2, which is more than 15 years old), but it is kept because
> it serves as the fallback for configurations without SSE 4.2, and
> apparently Solaris still uses it due to toolchain deficiencies.

OK if nobody objects this week.

Thanks,
Richard.


Patch

diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index 16f2c23af1e1..1591dcdf151a 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -290,71 +290,6 @@  static const char repl_chars[4][16] __attribute__((aligned(16))) = {
     '?', '?', '?', '?', '?', '?', '?', '?' },
 };
 
-/* A version of the fast scanner using MMX vectorized byte compare insns.
-
-   This uses the PMOVMSKB instruction which was introduced with "MMX2",
-   which was packaged into SSE1; it is also present in the AMD MMX
-   extension.  Mark the function as using "sse" so that we emit a real
-   "emms" instruction, rather than the 3dNOW "femms" instruction.  */
-
-static const uchar *
-#ifndef __SSE__
-__attribute__((__target__("sse")))
-#endif
-search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
-{
-  typedef char v8qi __attribute__ ((__vector_size__ (8)));
-  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
-
-  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
-  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
-  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
-  const v8qi repl_qm = *(const v8qi *)repl_chars[3];
-
-  unsigned int misalign, found, mask;
-  const v8qi *p;
-  v8qi data, t, c;
-
-  /* Align the source pointer.  While MMX doesn't generate unaligned data
-     faults, this allows us to safely scan to the end of the buffer without
-     reading beyond the end of the last page.  */
-  misalign = (uintptr_t)s & 7;
-  p = (const v8qi *)((uintptr_t)s & -8);
-  data = *p;
-
-  /* Create a mask for the bytes that are valid within the first
-     16-byte block.  The Idea here is that the AND with the mask
-     within the loop is "free", since we need some AND or TEST
-     insn in order to set the flags for the branch anyway.  */
-  mask = -1u << misalign;
-
-  /* Main loop processing 8 bytes at a time.  */
-  goto start;
-  do
-    {
-      data = *++p;
-      mask = -1;
-
-    start:
-      t = __builtin_ia32_pcmpeqb(data, repl_nl);
-      c = __builtin_ia32_pcmpeqb(data, repl_cr);
-      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
-      c = __builtin_ia32_pcmpeqb(data, repl_bs);
-      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
-      c = __builtin_ia32_pcmpeqb(data, repl_qm);
-      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
-      found = __builtin_ia32_pmovmskb (t);
-      found &= mask;
-    }
-  while (!found);
-
-  __builtin_ia32_emms ();
-
-  /* FOUND contains 1 in bits for which we matched a relevant
-     character.  Conversion to the byte index is trivial.  */
-  found = __builtin_ctz(found);
-  return (const uchar *)p + found;
-}
 
 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
 
@@ -509,8 +444,6 @@  init_vectorized_lexer (void)
   minimum = 3;
 #elif defined(__SSE2__)
   minimum = 2;
-#elif defined(__SSE__)
-  minimum = 1;
 #endif
 
   if (minimum == 3)
@@ -521,14 +454,6 @@  init_vectorized_lexer (void)
         impl = search_line_sse42;
       else if (minimum == 2 || (edx & bit_SSE2))
 	impl = search_line_sse2;
-      else if (minimum == 1 || (edx & bit_SSE))
-	impl = search_line_mmx;
-    }
-  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
-    {
-      if (minimum == 1
-	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
-	impl = search_line_mmx;
     }
 
   search_line_fast = impl;
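
With the MMX path gone, init_vectorized_lexer only needs the CPUID
leaf-1 probe; the extended-leaf 0x80000001 check for AMD's
MMXEXT/CMOV combination disappears along with it.  Ignoring the
compile-time "minimum" shortcut that the real code keeps, the
remaining runtime selection boils down to something like the sketch
below, with placeholder function pointers standing in for libcpp's
search_line_* implementations:

#include <cpuid.h>   /* __get_cpuid, bit_SSE4_2, bit_SSE2 */

typedef const unsigned char *(*scanner_fn) (const unsigned char *,
                                            const unsigned char *);

static scanner_fn
pick_scanner (scanner_fn sse42, scanner_fn sse2, scanner_fn fallback)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID leaf 1 reports the SSE feature bits in ECX/EDX.  */
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
    {
      if (ecx & bit_SSE4_2)
        return sse42;
      if (edx & bit_SSE2)
        return sse2;
    }
  /* No usable vector extension reported: use the portable scanner.  */
  return fallback;
}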