diff mbox

[2/3] i386: Rewrite ix86_expand_vshuffle.

Message ID 1317836852-16178-3-git-send-email-rth@redhat.com
State New
Headers show

Commit Message

Richard Henderson Oct. 5, 2011, 5:47 p.m. UTC
1: Handle TARGET_XOP.
2: Reduce code duplication.
3: Use ASHIFT instead of MULT for scaling.
4: Fix errors in building convert-to-v16qi indices.
5: Handle v2di without sse4.1.
---
 gcc/ChangeLog                 |    6 +
 gcc/config/i386/i386-protos.h |    2 +-
 gcc/config/i386/i386.c        |  208 ++++++++++++++++++++---------------------
 gcc/config/i386/sse.md        |    4 +-
 4 files changed, 109 insertions(+), 111 deletions(-)
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a88854d..4b5816d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -6,6 +6,12 @@ 
 	code duplication.  Do update_stmt here ...
 	(expand_vector_operations_1): ... not here.
 
+	* config/i386/i386.c (ix86_expand_vshuffle): Never fail.  Handle
+	TARGET_XOP.  Fix pshufb constant vector creation.  Reduce code
+	duplication.  Handle V2DI without SSE4.1.
+	* config/i386/i386-protos.h (ix86_expand_vshuffle): Update decl.
+	* config/i386/sse.md (vshuffle<V_128>): Remove assert for ok.
+
 2011-10-05  DJ Delorie  <dj@redhat.com>
 	    Nick Clifton  <nickc@redhat.com>
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 99327ed..0bbfa9b 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -123,7 +123,7 @@  extern bool ix86_expand_int_movcc (rtx[]);
 extern bool ix86_expand_fp_movcc (rtx[]);
 extern bool ix86_expand_fp_vcond (rtx[]);
 extern bool ix86_expand_int_vcond (rtx[]);
-extern bool ix86_expand_vshuffle (rtx[]);
+extern void ix86_expand_vshuffle (rtx[]);
 extern void ix86_expand_sse_unpack (rtx[], bool, bool);
 extern bool ix86_expand_int_addcc (rtx[]);
 extern rtx ix86_expand_call (rtx, rtx, rtx, rtx, rtx, bool);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4c1db3a..80a9e73 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -19236,145 +19236,139 @@  ix86_expand_int_vcond (rtx operands[])
   return true;
 }
 
-bool
+void
 ix86_expand_vshuffle (rtx operands[])
 {
   rtx target = operands[0];
   rtx op0 = operands[1];
   rtx op1 = operands[2];
   rtx mask = operands[3];
-  rtx new_mask, vt, t1, t2, w_vector;
+  rtx vt, vec[16];
   enum machine_mode mode = GET_MODE (op0);
   enum machine_mode maskmode = GET_MODE (mask);
-  enum machine_mode maskinner = GET_MODE_INNER (mode);
-  rtx vec[16];
-  int w, i, j;
-  bool one_operand_shuffle = op0 == op1;
+  int w, e, i;
+  bool one_operand_shuffle = rtx_equal_p (op0, op1);
 
-  gcc_assert ((TARGET_SSSE3 || TARGET_AVX) && GET_MODE_BITSIZE (mode) == 128);
+  gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128);
 
   /* Number of elements in the vector.  */
-  w = GET_MODE_BITSIZE (maskmode) / GET_MODE_BITSIZE (maskinner);
-
-  /* generate w_vector = {w, w, ...}  */
-  for (i = 0; i < w; i++)
-    vec[i] = GEN_INT (w);
-  w_vector = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
-
-  /* mask = mask & {w-1, w-1, w-1,...} */
-  for (i = 0; i < w; i++)
-    vec[i] = GEN_INT (w - 1);
+  w = GET_MODE_NUNITS (mode);
+  e = GET_MODE_UNIT_SIZE (mode);
 
-  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
-  new_mask = expand_simple_binop (maskmode, AND, mask, vt,
-				  NULL_RTX, 0, OPTAB_DIRECT);
-
-  /* If the original vector mode is V16QImode, we can just
-     use pshufb directly.  */
-  if (mode == V16QImode && one_operand_shuffle)
+  if (TARGET_XOP)
     {
-      t1 = gen_reg_rtx (V16QImode);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
-      emit_insn (gen_rtx_SET (VOIDmode, target, t1));
-      return true;
+      /* The XOP VPPERM insn supports three inputs.  By ignoring the 
+	 one_operand_shuffle special case, we avoid creating another
+	 set of constant vectors in memory.  */
+      one_operand_shuffle = false;
+
+      /* mask = mask & {2*w-1, ...} */
+      vt = GEN_INT (2*w - 1);
     }
-  else if (mode == V16QImode)
+  else
     {
-      rtx xops[6];
-
-      t1 = gen_reg_rtx (V16QImode);
-      t2 = gen_reg_rtx (V16QImode);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
-      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, new_mask));
-
-      /* mask = mask & {w, w, ...}  */
-      mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
-				  NULL_RTX, 0, OPTAB_DIRECT);
-      xops[0] = target;
-      xops[1] = operands[1];
-      xops[2] = operands[2];
-      xops[3] = gen_rtx_EQ (mode, mask, w_vector);
-      xops[4] = t1;
-      xops[5] = t2;
-
-      return ix86_expand_int_vcond (xops);
+      /* mask = mask & {w-1, ...} */
+      vt = GEN_INT (w - 1);
     }
 
-  /* mask = mask * {w, w, ...}  */
-  new_mask = expand_simple_binop (maskmode, MULT, new_mask, w_vector,
-				  NULL_RTX, 0, OPTAB_DIRECT);
-
-  /* Convert mask to vector of chars.  */
-  new_mask = simplify_gen_subreg (V16QImode, new_mask, maskmode, 0);
-  new_mask = force_reg (V16QImode, new_mask);
-
-  /* Build a helper mask wich we will use in pshufb
-     (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
-     (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}
-     ...  */
-  for (i = 0; i < w; i++)
-    for (j = 0; j < 16/w; j++)
-      vec[i*w+j] = GEN_INT (i*16/w);
-  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
-  vt = force_reg (V16QImode, vt);
-
-  t1 = gen_reg_rtx (V16QImode);
-  emit_insn (gen_ssse3_pshufbv16qi3 (t1, new_mask, vt));
-  new_mask = t1;
-
-  /* Convert it into the byte positions by doing
-     new_mask = new_mask + {0,1,..,16/w, 0,1,..,16/w, ...}  */
   for (i = 0; i < w; i++)
-    for (j = 0; j < 16/w; j++)
-      vec[i*w+j] = GEN_INT (j);
+    vec[i] = vt;
+  vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+  mask = expand_simple_binop (maskmode, AND, mask, vt,
+			      NULL_RTX, 0, OPTAB_DIRECT);
 
-  vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
-  new_mask = expand_simple_binop (V16QImode, PLUS, new_mask, vt,
+  /* For non-QImode operations, convert the word permutation control
+     into a byte permutation control.  */
+  if (mode != V16QImode)
+    {
+      mask = expand_simple_binop (maskmode, ASHIFT, mask,
+				  GEN_INT (exact_log2 (e)),
 				  NULL_RTX, 0, OPTAB_DIRECT);
 
-  t1 = gen_reg_rtx (V16QImode);
+      /* Convert mask to vector of chars.  */
+      mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
+
+      /* Replicate each of the input bytes into byte positions:
+	 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
+	 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
+	 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}.  */
+      for (i = 0; i < 16; ++i)
+	vec[i] = GEN_INT (i/e * e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = force_const_mem (V16QImode, vt);
+      if (TARGET_XOP)
+	emit_insn (gen_xop_pperm (mask, mask, mask, vt));
+      else
+	emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
 
-  /* Convert OP0 to vector of chars.  */
-  op0 = simplify_gen_subreg (V16QImode, op0, mode, 0);
-  op0 = force_reg (V16QImode, op0);
-  emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, new_mask));
+      /* Convert it into the byte positions by doing
+	 mask = mask + {0,1,..,e-1, 0,1,..,e-1, ...}  */
+      for (i = 0; i < 16; ++i)
+	vec[i] = GEN_INT (i % e);
+      vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
+      vt = force_const_mem (V16QImode, vt);
+      emit_insn (gen_addv16qi3 (mask, mask, vt));
+    }
 
-  if (one_operand_shuffle)
+  /* The actual shuffle operations all operate on V16QImode.  */
+  op0 = gen_lowpart (V16QImode, op0);
+  op1 = gen_lowpart (V16QImode, op1);
+  target = gen_lowpart (V16QImode, target);
+
+  if (TARGET_XOP)
     {
-      /* Convert it back from vector of chars to the original mode.  */
-      t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
-      emit_insn (gen_rtx_SET (VOIDmode, target, t1));
-      return true;
+      emit_insn (gen_xop_pperm (target, op0, op1, mask));
+    }
+  else if (one_operand_shuffle)
+    {
+      emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
     }
   else
     {
-      rtx xops[6];
+      rtx xops[6], t1, t2;
+      bool ok;
 
+      /* Shuffle the two input vectors independently.  */
+      t1 = gen_reg_rtx (V16QImode);
       t2 = gen_reg_rtx (V16QImode);
-
-      /* Convert OP1 to vector of chars.  */
-      op1 = simplify_gen_subreg (V16QImode, op1, mode, 0);
-      op1 = force_reg (V16QImode, op1);
-      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op1, new_mask));
-
-      /* mask = mask & {w, w, ...}  */
-      mask = expand_simple_binop (V16QImode, AND, mask, w_vector,
+      emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
+      emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
+
+      /* Then merge them together.  The key is whether any given control
+         element contained a bit set that indicates the second word.  */
+      mask = operands[3];
+      vt = GEN_INT (w);
+      if (maskmode == V2DImode && !TARGET_SSE4_1)
+	{
+	  /* Without SSE4.1, we don't have V2DImode EQ.  Perform one
+	     more shuffle to convert the V2DI input mask into a V4SI
+	     input mask.  At which point the masking that expand_int_vcond
+	     performs will work as desired.  */
+	  rtx t3 = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
+				        const0_rtx, const0_rtx,
+				        const2_rtx, const2_rtx));
+	  mask = t3;
+	  maskmode = V4SImode;
+	  e = w = 4;
+	}
+
+      for (i = 0; i < w; i++)
+	vec[i] = vt;
+      vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+      vt = force_reg (maskmode, vt);
+      mask = expand_simple_binop (maskmode, AND, mask, vt,
 				  NULL_RTX, 0, OPTAB_DIRECT);
 
-      t1 = simplify_gen_subreg (mode, t1, V16QImode, 0);
-      t2 = simplify_gen_subreg (mode, t2, V16QImode, 0);
-
-      xops[0] = target;
-      xops[1] = operands[1];
-      xops[2] = operands[2];
-      xops[3] = gen_rtx_EQ (mode, mask, w_vector);
-      xops[4] = t1;
-      xops[5] = t2;
-
-      return ix86_expand_int_vcond (xops);
+      xops[0] = gen_lowpart (maskmode, operands[0]);
+      xops[1] = gen_lowpart (maskmode, t2);
+      xops[2] = gen_lowpart (maskmode, t1);
+      xops[3] = gen_rtx_EQ (maskmode, mask, vt);
+      xops[4] = mask;
+      xops[5] = vt;
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
     }
-
-  return false;
 }
 
 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 251cdde..ee9cf0b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6229,12 +6229,10 @@ 
    (match_operand:<sseshuffint> 3 "register_operand" "")]
   "TARGET_SSSE3 || TARGET_AVX"
 {
-  bool ok = ix86_expand_vshuffle (operands);
-  gcc_assert (ok);
+  ix86_expand_vshuffle (operands);
   DONE;
 })
 
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel bitwise logical operations