[x86] Add STV support for DImode and SImode rotations by constant.

Message ID 016001d9ab24$9388d1d0$ba9a7570$@nextmovesoftware.com
State New
Series [x86] Add STV support for DImode and SImode rotations by constant.

Commit Message

Roger Sayle June 30, 2023, 7:29 a.m. UTC
This patch implements scalar-to-vector (STV) support for DImode and
SImode rotations by constant bit counts.  Scalar rotations are almost
always optimal on x86, requiring only one or two instructions, but
they can also be implemented efficiently with SSE2: one or two
instructions for SImode rotations and at most three instructions for
DImode rotations.  This allows GCC to STV rotations at little or no
penalty when there are other (net) benefits to converting a chain.
An example of the benefit, based upon the BLAKE2 cryptographic hash
function, is shown below:

unsigned long long a,b,c,d;

unsigned long rot(unsigned long long x, int y)
{
  return (x<<y) | (x>>(64-y));
}

void foo()
{
  d = rot(d ^ a,32);
  c = c + d;
  b = rot(b ^ c,24);
  a = a + b;
  d = rot(d ^ a,16);
  c = c + d;
  b = rot(b ^ c,63);
}

where, with -m32 -O2 -msse2:

Before (59 insns, 247 bytes):
foo:    pushl   %edi
        xorl    %edx, %edx
        pushl   %esi
        pushl   %ebx
        subl    $16, %esp
        movq    a, %xmm1
        movq    d, %xmm0
        movq    b, %xmm2
        pxor    %xmm1, %xmm0
        psrlq   $32, %xmm0
        movd    %xmm0, %eax
        movd    %edx, %xmm0
        movd    %eax, %xmm3
        punpckldq       %xmm0, %xmm3
        movq    c, %xmm0
        paddq   %xmm3, %xmm0
        pxor    %xmm0, %xmm2
        movd    %xmm2, %ecx
        psrlq   $32, %xmm2
        movd    %xmm2, %ebx
        movl    %ecx, %eax
        shldl   $24, %ebx, %ecx
        shldl   $24, %eax, %ebx
        movd    %ebx, %xmm4
        movd    %ecx, %xmm2
        punpckldq       %xmm4, %xmm2
        movdqa  .LC0, %xmm4
        pand    %xmm4, %xmm2
        paddq   %xmm2, %xmm1
        movq    %xmm1, a
        pxor    %xmm3, %xmm1
        movd    %xmm1, %esi
        psrlq   $32, %xmm1
        movd    %xmm1, %edi
        movl    %esi, %eax
        shldl   $16, %edi, %esi
        shldl   $16, %eax, %edi
        movd    %esi, %xmm1
        movd    %edi, %xmm3
        punpckldq       %xmm3, %xmm1
        pand    %xmm4, %xmm1
        movq    %xmm1, d
        paddq   %xmm1, %xmm0
        movq    %xmm0, c
        pxor    %xmm2, %xmm0
        movd    %xmm0, 8(%esp)
        psrlq   $32, %xmm0
        movl    8(%esp), %eax
        movd    %xmm0, 12(%esp)
        movl    12(%esp), %edx
        shrdl   $1, %edx, %eax
        xorl    %edx, %edx
        movl    %eax, b
        movl    %edx, b+4
        addl    $16, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        ret

After (32 insns, 165 bytes):
        movq    a, %xmm1
        xorl    %edx, %edx
        movq    d, %xmm0
        movq    b, %xmm2
        movdqa  .LC0, %xmm4
        pxor    %xmm1, %xmm0
        psrlq   $32, %xmm0
        movd    %xmm0, %eax
        movd    %edx, %xmm0
        movd    %eax, %xmm3
        punpckldq       %xmm0, %xmm3
        movq    c, %xmm0
        paddq   %xmm3, %xmm0
        pxor    %xmm0, %xmm2
        pshufd  $68, %xmm2, %xmm2
        psrldq  $5, %xmm2
        pand    %xmm4, %xmm2
        paddq   %xmm2, %xmm1
        movq    %xmm1, a
        pxor    %xmm3, %xmm1
        pshuflw $147, %xmm1, %xmm1
        pand    %xmm4, %xmm1
        movq    %xmm1, d
        paddq   %xmm1, %xmm0
        movq    %xmm0, c
        pxor    %xmm2, %xmm0
        pshufd  $20, %xmm0, %xmm0
        psrlq   $1, %xmm0
        pshufd  $136, %xmm0, %xmm0
        pand    %xmm4, %xmm0
        movq    %xmm0, b
        ret
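
For reference, the SSE2 idioms used above (and emitted by the new
convert_rotate) correspond to the intrinsics sketch below.  This is
purely illustrative: the function names are invented, only the low
64 (or 32) bits of each result are meaningful, and the patch itself
emits RTL directly rather than intrinsics.

#include <emmintrin.h>

/* DImode rotate right by 32: a single pshufd.  */
static __m128i rotr64_32 (__m128i x)
{
  return _mm_shuffle_epi32 (x, 0xe1);           /* pshufd $225  */
}

/* DImode rotate right by a multiple of 8, e.g. 40: duplicate the
   low quadword, then shift the whole vector right by 40/8 bytes.  */
static __m128i rotr64_40 (__m128i x)
{
  x = _mm_shuffle_epi32 (x, 0x44);              /* pshufd $68   */
  return _mm_srli_si128 (x, 5);                 /* psrldq $5    */
}

/* The general DImode case, e.g. rotate right by 1: pair x with a
   copy whose 32-bit halves are swapped, shift both quadwords right,
   then gather the two halves of the result.  */
static __m128i rotr64_1 (__m128i x)
{
  x = _mm_shuffle_epi32 (x, 0x14);              /* pshufd $20   */
  x = _mm_srli_epi64 (x, 1);                    /* psrlq $1     */
  return _mm_shuffle_epi32 (x, 0x88);           /* pshufd $136  */
}

/* SImode rotate by 16: a single pshuflw.  */
static __m128i rotr32_16 (__m128i x)
{
  return _mm_shufflelo_epi16 (x, 0xe1);         /* pshuflw $225 */
}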


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures.  Ok for mainline?


2023-06-30  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-features.cc (compute_convert_gain): Provide
        gains/costs for ROTATE and ROTATERT (by an integer constant).
        (general_scalar_chain::convert_rotate): New helper function to
        convert a DImode or SImode rotation by an integer constant into
        SSE vector form.
        (general_scalar_chain::convert_insn): Call the new convert_rotate
        for ROTATE and ROTATERT.
        (general_scalar_to_vector_candidate_p): Consider ROTATE and
        ROTATERT to be candidates if the second operand is an integer
        constant, valid for a rotation (or shift) in the given mode.
        * config/i386/i386-features.h (general_scalar_chain): Add new
        helper method convert_rotate.

gcc/testsuite/ChangeLog
        * gcc.target/i386/rotate-6.c: New test case.
        * gcc.target/i386/sse2-stv-1.c: Likewise.


Thanks in advance,
Roger
--

Comments

Uros Bizjak June 30, 2023, 8:27 a.m. UTC | #1
On Fri, Jun 30, 2023 at 9:29 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
> [patch description, before/after assembly, and ChangeLog snipped]

LGTM.

Please note that AVX512VL provides VPROLD/VPROLQ and VPRORD/VPRORQ
native rotate instructions that can come handy here.
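
For instance (an untested sketch, not part of this patch), with
-mavx512vl a 64-bit rotate by a constant maps onto a single
instruction:

#include <immintrin.h>

/* AVX512VL: rotate each quadword left by 24 with one vprolq.  */
__m128i rol64_24 (__m128i x)
{
  return _mm_rol_epi64 (x, 24);
}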

Thanks,
Uros.


Patch

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4a3b07a..b98baba 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -582,6 +582,25 @@  general_scalar_chain::compute_convert_gain ()
 	      igain -= vector_const_cost (XEXP (src, 0));
 	    break;
 
+	  case ROTATE:
+	  case ROTATERT:
+	    igain += m * ix86_cost->shift_const;
+	    if (smode == DImode)
+	      {
+		int bits = INTVAL (XEXP (src, 1));
+		if ((bits & 0x0f) == 0)
+		  igain -= ix86_cost->sse_op;
+		else if ((bits & 0x07) == 0)
+		  igain -= 2 * ix86_cost->sse_op;
+		else
+		  igain -= 3 * ix86_cost->sse_op;
+	      }
+	    else if (INTVAL (XEXP (src, 1)) == 16)
+	      igain -= ix86_cost->sse_op;
+	    else
+	      igain -= 2 * ix86_cost->sse_op;
+	    break;
+
 	  case AND:
 	  case IOR:
 	  case XOR:
@@ -1154,6 +1173,95 @@  scalar_chain::convert_insn_common (rtx_insn *insn)
 	}
 }
 
+/* Convert INSN which is an SImode or DImode rotation by a constant
+   to vector mode.  CODE is either ROTATE or ROTATERT with operands
+   OP0 and OP1.  Returns the SET_SRC of the last instruction in the
+   resulting sequence, which is emitted before INSN.  */
+
+rtx
+general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
+				      rtx_insn *insn)
+{
+  int bits = INTVAL (op1);
+  rtx pat, result;
+
+  convert_op (&op0, insn);
+  if (bits == 0)
+    return op0;
+
+  if (smode == DImode)
+    {
+      if (code == ROTATE)
+	bits = 64 - bits;
+      if (bits == 32)
+	{
+	  rtx tmp1 = gen_reg_rtx (V4SImode);
+	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+				 GEN_INT (225));
+	  emit_insn_before (pat, insn);
+	  result = gen_lowpart (V2DImode, tmp1);
+	}
+      else if (bits == 16 || bits == 48)
+	{
+	  rtx tmp1 = gen_reg_rtx (V8HImode);
+	  pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
+				  GEN_INT (bits == 16 ? 57 : 147));
+	  emit_insn_before (pat, insn);
+	  result = gen_lowpart (V2DImode, tmp1);
+	}
+      else if ((bits & 0x07) == 0)
+	{
+	  rtx tmp1 = gen_reg_rtx (V4SImode);
+	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+				 GEN_INT (68));
+	  emit_insn_before (pat, insn);
+	  rtx tmp2 = gen_reg_rtx (V1TImode);
+	  pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
+				    GEN_INT (bits));
+	  emit_insn_before (pat, insn);
+	  result = gen_lowpart (V2DImode, tmp2);
+	}
+      else
+	{
+	  rtx tmp1 = gen_reg_rtx (V4SImode);
+	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
+				 GEN_INT (20));
+	  emit_insn_before (pat, insn);
+	  rtx tmp2 = gen_reg_rtx (V2DImode);
+	  pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
+			       GEN_INT (bits & 31));
+	  emit_insn_before (pat, insn);
+	  rtx tmp3 = gen_reg_rtx (V4SImode);
+	  pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
+				 GEN_INT (bits > 32 ? 34 : 136));
+	  emit_insn_before (pat, insn);
+	  result = gen_lowpart (V2DImode, tmp3);
+	}
+    }
+  else if (bits == 16)
+    {
+      rtx tmp1 = gen_reg_rtx (V8HImode);
+      pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
+      emit_insn_before (pat, insn);
+      result = gen_lowpart (V4SImode, tmp1);
+    }
+  else
+    {
+      if (code == ROTATE)
+	bits = 32 - bits;
+
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
+			   GEN_INT (bits));
+      emit_insn_before (pat, insn);
+      result = gen_lowpart (V4SImode, tmp2);
+    }
+
+  return result;
+}
+
 /* Convert INSN to vector mode.  */
 
 void
@@ -1209,6 +1317,12 @@  general_scalar_chain::convert_insn (rtx_insn *insn)
       PUT_MODE (src, vmode);
       break;
 
+    case ROTATE:
+    case ROTATERT:
+      src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
+			    insn);
+      break;
+
     case NEG:
       src = XEXP (src, 0);
 
@@ -1982,6 +2096,8 @@  general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
 
     case ASHIFT:
     case LSHIFTRT:
+    case ROTATE:
+    case ROTATERT:
       if (!CONST_INT_P (XEXP (src, 1))
 	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
 	return false;
diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h
index 72a9f54..af5acbb 100644
--- a/gcc/config/i386/i386-features.h
+++ b/gcc/config/i386/i386-features.h
@@ -189,6 +189,7 @@  class general_scalar_chain : public scalar_chain
   void convert_insn (rtx_insn *insn) final override;
   void convert_op (rtx *op, rtx_insn *insn) final override;
   int vector_const_cost (rtx exp);
+  rtx convert_rotate (enum rtx_code, rtx op0, rtx op1, rtx_insn *insn);
 };
 
 class timode_scalar_chain : public scalar_chain
diff --git a/gcc/testsuite/gcc.target/i386/rotate-6.c b/gcc/testsuite/gcc.target/i386/rotate-6.c
new file mode 100644
index 0000000..42c2072
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/rotate-6.c
@@ -0,0 +1,195 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+/* scalar 64-bit DImode rotations.  */
+unsigned long long rot1(unsigned long long x) { return (x>>1) | (x<<63); }
+unsigned long long rot2(unsigned long long x) { return (x>>2) | (x<<62); }
+unsigned long long rot3(unsigned long long x) { return (x>>3) | (x<<61); }
+unsigned long long rot4(unsigned long long x) { return (x>>4) | (x<<60); }
+unsigned long long rot5(unsigned long long x) { return (x>>5) | (x<<59); }
+unsigned long long rot6(unsigned long long x) { return (x>>6) | (x<<58); }
+unsigned long long rot7(unsigned long long x) { return (x>>7) | (x<<57); }
+unsigned long long rot8(unsigned long long x) { return (x>>8) | (x<<56); }
+unsigned long long rot9(unsigned long long x) { return (x>>9) | (x<<55); }
+unsigned long long rot10(unsigned long long x) { return (x>>10) | (x<<54); }
+unsigned long long rot15(unsigned long long x) { return (x>>15) | (x<<49); }
+unsigned long long rot16(unsigned long long x) { return (x>>16) | (x<<48); }
+unsigned long long rot17(unsigned long long x) { return (x>>17) | (x<<47); }
+unsigned long long rot20(unsigned long long x) { return (x>>20) | (x<<44); }
+unsigned long long rot24(unsigned long long x) { return (x>>24) | (x<<40); }
+unsigned long long rot30(unsigned long long x) { return (x>>30) | (x<<34); }
+unsigned long long rot31(unsigned long long x) { return (x>>31) | (x<<33); }
+unsigned long long rot32(unsigned long long x) { return (x>>32) | (x<<32); }
+unsigned long long rot33(unsigned long long x) { return (x>>33) | (x<<31); }
+unsigned long long rot34(unsigned long long x) { return (x>>34) | (x<<30); }
+unsigned long long rot40(unsigned long long x) { return (x>>40) | (x<<24); }
+unsigned long long rot42(unsigned long long x) { return (x>>42) | (x<<22); }
+unsigned long long rot48(unsigned long long x) { return (x>>48) | (x<<16); }
+unsigned long long rot50(unsigned long long x) { return (x>>50) | (x<<14); }
+unsigned long long rot56(unsigned long long x) { return (x>>56) | (x<<8); }
+unsigned long long rot58(unsigned long long x) { return (x>>58) | (x<<6); }
+unsigned long long rot60(unsigned long long x) { return (x>>60) | (x<<4); }
+unsigned long long rot61(unsigned long long x) { return (x>>61) | (x<<3); }
+unsigned long long rot62(unsigned long long x) { return (x>>62) | (x<<2); }
+unsigned long long rot63(unsigned long long x) { return (x>>63) | (x<<1); }
+
+/* DImode mem-to-mem rotations. These STV with -m32.  */
+void mem1(unsigned long long *p) { *p = rot1(*p); }
+void mem2(unsigned long long *p) { *p = rot2(*p); }
+void mem3(unsigned long long *p) { *p = rot3(*p); }
+void mem4(unsigned long long *p) { *p = rot4(*p); }
+void mem5(unsigned long long *p) { *p = rot5(*p); }
+void mem6(unsigned long long *p) { *p = rot6(*p); }
+void mem7(unsigned long long *p) { *p = rot7(*p); }
+void mem8(unsigned long long *p) { *p = rot8(*p); }
+void mem9(unsigned long long *p) { *p = rot9(*p); }
+void mem10(unsigned long long *p) { *p = rot10(*p); }
+void mem15(unsigned long long *p) { *p = rot15(*p); }
+void mem16(unsigned long long *p) { *p = rot16(*p); }
+void mem17(unsigned long long *p) { *p = rot17(*p); }
+void mem20(unsigned long long *p) { *p = rot20(*p); }
+void mem24(unsigned long long *p) { *p = rot24(*p); }
+void mem30(unsigned long long *p) { *p = rot30(*p); }
+void mem31(unsigned long long *p) { *p = rot31(*p); }
+void mem32(unsigned long long *p) { *p = rot32(*p); }
+void mem33(unsigned long long *p) { *p = rot33(*p); }
+void mem34(unsigned long long *p) { *p = rot34(*p); }
+void mem40(unsigned long long *p) { *p = rot40(*p); }
+void mem42(unsigned long long *p) { *p = rot42(*p); }
+void mem48(unsigned long long *p) { *p = rot48(*p); }
+void mem50(unsigned long long *p) { *p = rot50(*p); }
+void mem56(unsigned long long *p) { *p = rot56(*p); }
+void mem58(unsigned long long *p) { *p = rot58(*p); }
+void mem60(unsigned long long *p) { *p = rot60(*p); }
+void mem61(unsigned long long *p) { *p = rot61(*p); }
+void mem62(unsigned long long *p) { *p = rot62(*p); }
+void mem63(unsigned long long *p) { *p = rot63(*p); }
+
+/* Check that rotN and memN give the same result.  */
+typedef unsigned long long (*rotN)(unsigned long long);
+typedef void (*memN)(unsigned long long*);
+
+void eval(rotN s, memN v, unsigned long long x)
+{
+  unsigned long long r = s(x);
+  unsigned long long t = x;
+  v(&t);
+
+  if (t != r)
+    __builtin_abort ();
+}
+
+void test(rotN s, memN v)
+{
+  eval(s,v,0x0000000000000000ll);
+  eval(s,v,0x0000000000000001ll);
+  eval(s,v,0x0000000000000002ll);
+  eval(s,v,0x0000000000000004ll);
+  eval(s,v,0x0000000000000008ll);
+  eval(s,v,0x0000000000000010ll);
+  eval(s,v,0x0000000000000020ll);
+  eval(s,v,0x0000000000000040ll);
+  eval(s,v,0x0000000000000080ll);
+  eval(s,v,0x0000000000000100ll);
+  eval(s,v,0x0000000000000200ll);
+  eval(s,v,0x0000000000000400ll);
+  eval(s,v,0x0000000000000800ll);
+  eval(s,v,0x0000000000001000ll);
+  eval(s,v,0x0000000000002000ll);
+  eval(s,v,0x0000000000004000ll);
+  eval(s,v,0x0000000000008000ll);
+  eval(s,v,0x0000000000010000ll);
+  eval(s,v,0x0000000000020000ll);
+  eval(s,v,0x0000000000040000ll);
+  eval(s,v,0x0000000000080000ll);
+  eval(s,v,0x0000000000100000ll);
+  eval(s,v,0x0000000000200000ll);
+  eval(s,v,0x0000000000400000ll);
+  eval(s,v,0x0000000000800000ll);
+  eval(s,v,0x0000000001000000ll);
+  eval(s,v,0x0000000002000000ll);
+  eval(s,v,0x0000000004000000ll);
+  eval(s,v,0x0000000008000000ll);
+  eval(s,v,0x0000000010000000ll);
+  eval(s,v,0x0000000020000000ll);
+  eval(s,v,0x0000000040000000ll);
+  eval(s,v,0x0000000080000000ll);
+  eval(s,v,0x0000000100000000ll);
+  eval(s,v,0x0000000200000000ll);
+  eval(s,v,0x0000000400000000ll);
+  eval(s,v,0x0000000800000000ll);
+  eval(s,v,0x0000001000000000ll);
+  eval(s,v,0x0000002000000000ll);
+  eval(s,v,0x0000004000000000ll);
+  eval(s,v,0x0000008000000000ll);
+  eval(s,v,0x0000010000000000ll);
+  eval(s,v,0x0000020000000000ll);
+  eval(s,v,0x0000040000000000ll);
+  eval(s,v,0x0000080000000000ll);
+  eval(s,v,0x0000100000000000ll);
+  eval(s,v,0x0000200000000000ll);
+  eval(s,v,0x0000400000000000ll);
+  eval(s,v,0x0000800000000000ll);
+  eval(s,v,0x0001000000000000ll);
+  eval(s,v,0x0002000000000000ll);
+  eval(s,v,0x0004000000000000ll);
+  eval(s,v,0x0008000000000000ll);
+  eval(s,v,0x0010000000000000ll);
+  eval(s,v,0x0020000000000000ll);
+  eval(s,v,0x0040000000000000ll);
+  eval(s,v,0x0080000000000000ll);
+  eval(s,v,0x0100000000000000ll);
+  eval(s,v,0x0200000000000000ll);
+  eval(s,v,0x0400000000000000ll);
+  eval(s,v,0x0800000000000000ll);
+  eval(s,v,0x1000000000000000ll);
+  eval(s,v,0x2000000000000000ll);
+  eval(s,v,0x4000000000000000ll);
+  eval(s,v,0x8000000000000000ll);
+  eval(s,v,0x0123456789abcdefll);
+  eval(s,v,0x1111111111111111ll);
+  eval(s,v,0x5555555555555555ll);
+  eval(s,v,0x8888888888888888ll);
+  eval(s,v,0xaaaaaaaaaaaaaaaall);
+  eval(s,v,0xcafebabecafebabell);
+  eval(s,v,0xdeadbeefdeadbeefll);
+  eval(s,v,0xfedcba9876543210ll);
+  eval(s,v,0xffffffffffffffffll);
+}
+
+int main()
+{
+  test(rot1,mem1);
+  test(rot2,mem2);
+  test(rot3,mem3);
+  test(rot4,mem4);
+  test(rot5,mem5);
+  test(rot6,mem6);
+  test(rot7,mem7);
+  test(rot8,mem8);
+  test(rot9,mem9);
+  test(rot10,mem10);
+  test(rot15,mem15);
+  test(rot16,mem16);
+  test(rot17,mem17);
+  test(rot20,mem20);
+  test(rot24,mem24);
+  test(rot30,mem30);
+  test(rot31,mem31);
+  test(rot32,mem32);
+  test(rot33,mem33);
+  test(rot34,mem34);
+  test(rot40,mem40);
+  test(rot42,mem42);
+  test(rot48,mem48);
+  test(rot50,mem50);
+  test(rot56,mem56);
+  test(rot58,mem58);
+  test(rot60,mem60);
+  test(rot61,mem61);
+  test(rot62,mem62);
+  test(rot63,mem63);
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-stv-1.c b/gcc/testsuite/gcc.target/i386/sse2-stv-1.c
new file mode 100644
index 0000000..a95d4ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-stv-1.c
@@ -0,0 +1,24 @@ 
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2" } */
+
+unsigned long long a,b,c,d;
+
+static unsigned long rot(unsigned long long x, int y)
+{
+  /* Only called with y in 1..63.  */
+  return (x<<y) | (x>>(64-y));
+}
+
+void foo()
+{
+    d = rot(d ^ a,32);
+    c = c + d;
+    b = rot(b ^ c,24);
+    a = a + b;
+    d = rot(d ^ a,16);
+    c = c + d;
+    b = rot(b ^ c,63);
+}
+
+/* { dg-final { scan-assembler-not "shldl" } } */
+/* { dg-final { scan-assembler-not "%\[er\]sp" } } */