diff mbox

[ARM] Implement widening vector moves and mults.

Message ID 1282655625.30429.72.camel@e102484-lin.cambridge.arm.com
State New
Headers show

Commit Message

Tejas Belagod Aug. 24, 2010, 1:13 p.m. UTC
Hi,

Attached is the new patch with the changes.

Thanks,
Tejas.

On Tue, 2010-08-24 at 10:14 +0100, Richard Earnshaw wrote:
> On Thu, 2010-08-19 at 15:14 +0100, Tejas Belagod wrote:
> > Hi,
> > 
> > Take 2 with the patch!
> > 
> > This patch implements support for vector widening signed and unsigned
> > moves and multiplications viz. VMOVL.<sign><size> and VMULL.<sign><size>
> > NEON instructions. This helps vectorize loops whose bodies have widening
> > moves or multiplications when compiled for NEON. This patch is
> > implemented to have support for vectorizing with and without
> > -mvectorize-with-neon-quad. 
> > 
> > Regression tested on trunk. OK for trunk?
> > 
> > --
> > Tejas Belagod
> > ARM.
> > 
> > gcc/testsuite
> > 
> > 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> > 
> > 	* lib/target-supports.exp (check_effective_target_vect_unpack):
> > 	Set vect_unpack supported flag to true for neon.
> > 
> > gcc/
> > 
> > 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> > 
> > 	* config/arm/iterators.md (VU, SE, V_widen_l): New. 
> > 	(V_unpack, US): New.
> > 	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Expansion for
> > 	vmovl.
> > 	(vec_unpack<US>_lo_<mode>): Likewise.
> > 	(neon_vec_unpack<US>_hi_<mode>): Instruction pattern for vmovl.
> > 	(neon_vec_unpack<US>_lo_<mode>): Likewise.
> > 	(vec_widen_<US>mult_lo_<mode>): Expansion for vmull.
> > 	(vec_widen_<US>mult_hi_<mode>): Likewise.
> > 	(neon_vec_<US>mult_lo_<mode>): Instruction pattern for vmull.
> > 	(neon_vec_<US>mult_hi_<mode>): Likewise.
> > 	(neon_unpack<US>_<mode>): Widening move intermediate step for
> > 	vectorizing without -mvectorize-with-neon-quad.
> > 	(neon_vec_<US>mult_<mode>): Widening multiply intermediate step
> > 	for vectorizing without -mvectorize-with-neon-quad.
> > 	* config/arm/predicates.md (vect_par_constant_high): Check for
> > 	high-half lanes of a vector.
> > 	(vect_par_constant_low): Check for low-half lanes of a vector.
> 
> +;; Assembler mnemonics for signedness of widening operations
> Full stop at end of sentence.
> 
> 
> +;; Predicates for parallel expanders based on mode.
> +(define_special_predicate "vect_par_constant_high" 
> +  (match_code "parallel")
> +{
> [...]
> +
> +  for (i = 0; i < count; i++)
> +   {
> +     rtx elt = XVECEXP (op, 0, i);
> +     int val = INTVAL (elt);
> 
> It's unlikely that this will ever fault, as the uses of this predicate
> are fairly limited, but good coding practice says that you shouldn't
> assume that.  So you need to confirm that elt is a const_int before
> extracting its value (and if it's not the predicate fails to match).
> Similarly for vect_par_constant_low.
> 
> Otherwise, this is OK.
> 
> R.
>

Comments

Richard Earnshaw Aug. 24, 2010, 2:14 p.m. UTC | #1
On Tue, 2010-08-24 at 14:13 +0100, Tejas Belagod wrote:
> Hi,
> 
> Attached is the new patch with the changes.
> 
> Thanks,
> Tejas.
> 

OK.

R.

> On Tue, 2010-08-24 at 10:14 +0100, Richard Earnshaw wrote:
> > On Thu, 2010-08-19 at 15:14 +0100, Tejas Belagod wrote:
> > > Hi,
> > > 
> > > Take 2 with the patch!
> > > 
> > > This patch implements support for vector widening signed and unsigned
> > > moves and multiplications viz. VMOVL.<sign><size> and VMULL.<sign><size>
> > > NEON instructions. This helps vectorize loops whose bodies have widening
> > > moves or multiplications when compiled for NEON. This patch is
> > > implemented to have support for vectorizing with and without
> > > -mvectorize-with-neon-quad. 
> > > 
> > > Regression tested on trunk. OK for trunk?
> > > 
> > > --
> > > Tejas Belagod
> > > ARM.
> > > 
> > > gcc/testsuite
> > > 
> > > 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> > > 
> > > 	* lib/target-supports.exp (check_effective_target_vect_unpack):
> > > 	Set vect_unpack supported flag to true for neon.
> > > 
> > > gcc/
> > > 
> > > 2010-08-19 Tejas Belagod <tejas.belagod@arm.com>
> > > 
> > > 	* config/arm/iterators.md (VU, SE, V_widen_l): New. 
> > > 	(V_unpack, US): New.
> > > 	* config/arm/neon.md (vec_unpack<US>_hi_<mode>): Expansion for
> > > 	vmovl.
> > > 	(vec_unpack<US>_lo_<mode>): Likewise.
> > > 	(neon_vec_unpack<US>_hi_<mode>): Instruction pattern for vmovl.
> > > 	(neon_vec_unpack<US>_lo_<mode>): Likewise.
> > > 	(vec_widen_<US>mult_lo_<mode>): Expansion for vmull.
> > > 	(vec_widen_<US>mult_hi_<mode>): Likewise.
> > > 	(neon_vec_<US>mult_lo_<mode>): Instruction pattern for vmull.
> > > 	(neon_vec_<US>mult_hi_<mode>): Likewise.
> > > 	(neon_unpack<US>_<mode>): Widening move intermediate step for
> > > 	vectorizing without -mvectorize-with-neon-quad.
> > > 	(neon_vec_<US>mult_<mode>): Widening multiply intermediate step
> > > 	for vectorizing without -mvectorize-with-neon-quad.
> > > 	* config/arm/predicates.md (vect_par_constant_high): Check for
> > > 	high-half lanes of a vector.
> > > 	(vect_par_constant_low): Check for low-half lanes of a vector.
> > 
> > +;; Assembler mnemonics for signedness of widening operations
> > Full stop at end of sentence.
> > 
> > 
> > +;; Predicates for parallel expanders based on mode.
> > +(define_special_predicate "vect_par_constant_high" 
> > +  (match_code "parallel")
> > +{
> > [...]
> > +
> > +  for (i = 0; i < count; i++)
> > +   {
> > +     rtx elt = XVECEXP (op, 0, i);
> > +     int val = INTVAL (elt);
> > 
> > It's unlikely that this will ever fault, as the uses of this predicate
> > are fairly limited, but good coding practice says that you shouldn't
> > assume that.  So you need to confirm that elt is a const_int before
> > extracting its value (and if it's not the predicate fails to match).
> > Similarly for vect_par_constant_low.
> > 
> > Otherwise, this is OK.
> > 
> > R.
> >
diff mbox

Patch

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index ee04aab..d9b5621 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -136,7 +136,9 @@ 
 ;; Modes with 32-bit elements only.
 (define_mode_iterator V32 [V2SI V2SF V4SI V4SF])
 
-
+;; Modes with 8-bit, 16-bit and 32-bit elements.
+(define_mode_iterator VU [V16QI V8HI V4SI])
+ 
 ;;----------------------------------------------------------------------------
 ;; Code iterators
 ;;----------------------------------------------------------------------------
@@ -156,6 +158,8 @@ 
 ;; without unsigned variants (for use with *SFmode pattern).
 (define_code_iterator vqhs_ops [plus smin smax])
 
+;; A list of widening operators.
+(define_code_iterator SE [sign_extend zero_extend])
 
 ;;----------------------------------------------------------------------------
 ;; Mode attributes
@@ -360,6 +364,11 @@ 
                                  (V2SF "2") (V4SF "4")
                                  (DI "1")   (V2DI "2")])
 
+;; Same as V_widen, but lower-case.
+(define_mode_attr V_widen_l [(V8QI "v8hi") (V4HI "v4si") ( V2SI "v2di")])
+
+;; Widen. Result is half the number of elements, but widened to double-width.
+(define_mode_attr V_unpack   [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")])
 
 ;;----------------------------------------------------------------------------
 ;; Code attributes
@@ -375,3 +384,6 @@ 
 
 (define_code_attr cnb [(ltu "CC_C") (geu "CC")])
 (define_code_attr optab [(ltu "ltu") (geu "geu")])
+
+;; Assembler mnemonics for signedness of widening operations.
+(define_code_attr US [(sign_extend "s") (zero_extend "u")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index bdc279a..96241b9 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -4977,3 +4977,205 @@ 
   emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
   DONE;
 })
+
+(define_insn "neon_vec_unpack<US>_lo_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+        (SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 2 "vect_par_constant_low" ""))))]
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %e1"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vec_unpack<US>_hi_<mode>"
+  [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+        (SE:<V_unpack> (vec_select:<V_HALF>
+			  (match_operand:VU 1 "register_operand" "w")
+			  (match_operand:VU 2 "vect_par_constant_high" ""))))]
+  "TARGET_NEON"
+  "vmovl.<US><V_sz_elem> %q0, %f1"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_unpack<US>_hi_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand"))]
+ "TARGET_NEON"
+  {
+   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
+   rtx t1;
+   int i;
+   for (i = 0; i < (<V_mode_nunits>/2); i++)
+     RTVEC_ELT (v, i) = GEN_INT ((<V_mode_nunits>/2) + i);
+  
+   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+   emit_insn (gen_neon_vec_unpack<US>_hi_<mode> (operands[0], 
+                                                 operands[1], 
+					         t1));
+   DONE;
+  }
+)
+
+(define_expand "vec_unpack<US>_lo_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))]
+ "TARGET_NEON"
+  {
+   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
+   rtx t1;
+   int i;
+   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
+     RTVEC_ELT (v, i) = GEN_INT (i);
+   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+   emit_insn (gen_neon_vec_unpack<US>_lo_<mode> (operands[0], 
+                                                 operands[1], 
+				   	         t1));
+   DONE;
+  }
+)
+
+(define_insn "neon_vec_<US>mult_lo_<mode>"
+ [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+       (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			   (match_operand:VU 1 "register_operand" "w") 
+                           (match_operand:VU 2 "vect_par_constant_low" "")))
+ 		        (SE:<V_unpack> (vec_select:<V_HALF>
+                           (match_operand:VU 3 "register_operand" "w") 
+                           (match_dup 2)))))]
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %e1, %e3"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_lo_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
+ "TARGET_NEON"
+ {
+   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
+   rtx t1;
+   int i;
+   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
+     RTVEC_ELT (v, i) = GEN_INT (i);
+   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+   emit_insn (gen_neon_vec_<US>mult_lo_<mode> (operands[0],
+ 					       operands[1],
+					       t1,
+					       operands[2]));
+   DONE;
+ }
+)
+
+(define_insn "neon_vec_<US>mult_hi_<mode>"
+ [(set (match_operand:<V_unpack> 0 "register_operand" "=w")
+      (mult:<V_unpack> (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 1 "register_operand" "w") 
+			    (match_operand:VU 2 "vect_par_constant_high" "")))
+		       (SE:<V_unpack> (vec_select:<V_HALF>
+			    (match_operand:VU 3 "register_operand" "w") 
+			    (match_dup 2)))))]
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %f1, %f3"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_hi_<mode>"
+  [(match_operand:<V_unpack> 0 "register_operand" "")
+   (SE:<V_unpack> (match_operand:VU 1 "register_operand" ""))
+   (SE:<V_unpack> (match_operand:VU 2 "register_operand" ""))]
+ "TARGET_NEON"
+ {
+   rtvec v = rtvec_alloc (<V_mode_nunits>/2)  ;
+   rtx t1;
+   int i;
+   for (i = 0; i < (<V_mode_nunits>/2) ; i++)
+     RTVEC_ELT (v, i) = GEN_INT (<V_mode_nunits>/2 + i);
+   t1 = gen_rtx_PARALLEL (<MODE>mode, v);
+
+   emit_insn (gen_neon_vec_<US>mult_hi_<mode> (operands[0],
+ 					       operands[1],
+					       t1,
+					       operands[2]));
+   DONE;
+
+ }
+)
+
+;; Widening patterns for vectorizing without -mvectorize-with-neon-quad.
+(define_insn "neon_unpack<US>_<mode>"
+ [(set (match_operand:<V_widen> 0 "register_operand" "=w")
+       (SE:<V_widen> (match_operand:VDI 1 "register_operand" "w")))]
+ "TARGET_NEON"
+ "vmovl.<US><V_sz_elem> %q0, %P1"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_unpack<US>_lo_<mode>"
+ [(match_operand:<V_double_width> 0 "register_operand" "")
+  (SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
+ "TARGET_NEON"
+{
+  rtx tmpreg = gen_reg_rtx (<V_widen>mode);
+  emit_insn (gen_neon_unpack<US>_<mode> (tmpreg, operands[1]));
+  emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
+
+  DONE;
+}
+)
+
+(define_expand "vec_unpack<US>_hi_<mode>"
+ [(match_operand:<V_double_width> 0 "register_operand" "")
+  (SE:<V_double_width>(match_operand:VDI 1 "register_operand"))]
+ "TARGET_NEON"
+{
+  rtx tmpreg = gen_reg_rtx (<V_widen>mode);
+  emit_insn (gen_neon_unpack<US>_<mode> (tmpreg, operands[1]));
+  emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
+
+  DONE;
+}
+)
+
+(define_insn "neon_vec_<US>mult_<mode>"
+ [(set (match_operand:<V_widen> 0 "register_operand" "=w")
+       (mult:<V_widen> (SE:<V_widen> 
+		 	   (match_operand:VDI 1 "register_operand" "w"))
+ 		       (SE:<V_widen> 
+			   (match_operand:VDI 2 "register_operand" "w"))))]
+  "TARGET_NEON"
+  "vmull.<US><V_sz_elem> %q0, %P1, %P2"
+  [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_expand "vec_widen_<US>mult_hi_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
+ "TARGET_NEON"
+ {
+   rtx tmpreg = gen_reg_rtx (<V_widen>mode);
+   emit_insn (gen_neon_vec_<US>mult_<mode> (tmpreg, operands[1], operands[2]));
+   emit_insn (gen_neon_vget_high<V_widen_l> (operands[0], tmpreg));
+ 					    
+   DONE;
+
+ }
+)
+
+(define_expand "vec_widen_<US>mult_lo_<mode>"
+  [(match_operand:<V_double_width> 0 "register_operand" "")
+   (SE:<V_double_width> (match_operand:VDI 1 "register_operand" ""))
+   (SE:<V_double_width> (match_operand:VDI 2 "register_operand" ""))]
+ "TARGET_NEON"
+ {
+   rtx tmpreg = gen_reg_rtx (<V_widen>mode);
+   emit_insn (gen_neon_vec_<US>mult_<mode> (tmpreg, operands[1], operands[2]));
+   emit_insn (gen_neon_vget_low<V_widen_l> (operands[0], tmpreg));
+ 					    
+   DONE;
+
+ }
+)
diff --git a/gcc/config/arm/predicates.md b/gcc/config/arm/predicates.md
index da3b6dc..032b2ec 100644
--- a/gcc/config/arm/predicates.md
+++ b/gcc/config/arm/predicates.md
@@ -619,3 +619,61 @@ 
 		(and (match_test "TARGET_32BIT")
 		     (match_operand 0 "arm_di_operand"))))
 
+;; Predicates for parallel expanders based on mode.
+(define_special_predicate "vect_par_constant_high" 
+  (match_code "parallel")
+{
+  HOST_WIDE_INT count = XVECLEN (op, 0);
+  int i;
+  int base = GET_MODE_NUNITS (mode);
+
+  if ((count < 1)
+      || (count != base/2))
+    return false;
+    
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  for (i = 0; i < count; i++)
+   {
+     rtx elt = XVECEXP (op, 0, i);
+     int val;
+
+     if (GET_CODE (elt) != CONST_INT)
+       return false;
+
+     val = INTVAL (elt);
+     if (val != (base/2) + i)
+       return false;
+   }
+  return true; 
+})
+
+(define_special_predicate "vect_par_constant_low"
+  (match_code "parallel")
+{
+  HOST_WIDE_INT count = XVECLEN (op, 0);
+  int i;
+  int base = GET_MODE_NUNITS (mode);
+
+  if ((count < 1)
+      || (count != base/2))
+    return false;
+    
+  if (!VECTOR_MODE_P (mode))
+    return false;
+
+  for (i = 0; i < count; i++)
+   {
+     rtx elt = XVECEXP (op, 0, i);
+     int val;
+
+     if (GET_CODE (elt) != CONST_INT)
+       return false;
+
+     val = INTVAL (elt);
+     if (val != i)
+       return false;
+   } 
+  return true; 
+})
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 1682d58..4b95323 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2640,7 +2640,8 @@  proc check_effective_target_vect_unpack { } {
         if { ([istarget powerpc*-*-*] && ![istarget powerpc-*paired*])
              || [istarget i?86-*-*]
              || [istarget x86_64-*-*] 
-             || [istarget spu-*-*] } {
+             || [istarget spu-*-*]
+             || ([istarget arm*-*-*] && [check_effective_target_arm_neon]) } {
             set et_vect_unpack_saved 1
         }
     }