diff mbox series

[RFC,PR117093] match.pd: Fold vec_perm with view_convert

Message ID 794D9463-93D4-4EDC-AA5B-499A5557B0EF@nvidia.com
State New
Headers show
Series [RFC,PR117093] match.pd: Fold vec_perm with view_convert | expand

Commit Message

Jennifer Schmitz Nov. 5, 2024, 5:10 p.m. UTC
We are working on a patch to improve the codegen for the following test case:
uint64x2_t foo (uint64x2_t r) {
    uint32x4_t a = vreinterpretq_u32_u64 (r);
    uint32_t t;
    t = a[0]; a[0] = a[1]; a[1] = t;
    t = a[2]; a[2] = a[3]; a[3] = t;
    return vreinterpretq_u64_u32 (a);
}
that GCC currently compiles to (-O1):
foo:
        mov     v31.16b, v0.16b
        ins     v0.s[0], v0.s[1]
        ins     v0.s[1], v31.s[0]
        ins     v0.s[2], v31.s[3]
        ins     v0.s[3], v31.s[2]
        ret
whereas LLVM produces the preferable sequence
foo:
	rev64   v0.4s, v0.4s
        ret

On gimple level, we currently have:
  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
  t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
  a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
  a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
  t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
  _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
  a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
  a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
  _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
  return _10;

whereas the desired sequence is:
  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
  a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
  _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
  return _4;

If we remove the casts from the test case, the forwprop1 dump shows that
a series of match.pd patterns is applied (repeatedly, only showing the first
iteration here):
Applying pattern match.pd:10881, gimple-match-1.cc:25213
Applying pattern match.pd:11099, gimple-match-1.cc:25714
Applying pattern match.pd:9549, gimple-match-1.cc:24274
gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;

The reason why these patterns cannot be applied with casts seems to be
the failing types_match (@0, @1) in the following pattern:
/* Simplify vector inserts of other vector extracts to a permute.  */
(simplify
 (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
 (if (VECTOR_TYPE_P (type)
      && (VECTOR_MODE_P (TYPE_MODE (type))
          || optimize_vectors_before_lowering_p ())
      && types_match (@0, @1)
      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
      && multiple_p (wi::to_poly_offset (@rpos),
                     wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
  (with
   {
     [...]
   }
   (if (!VECTOR_MODE_P (TYPE_MODE (type))
        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
    (vec_perm @0 @1 { vec_perm_indices_to_tree
                        (build_vector_type (ssizetype, nunits), sel); })))))

The types_match fails, because the following pattern has already removed the
view_convert expression, thereby changing the type of @0:
(simplify
 (BIT_FIELD_REF (view_convert @0) @1 @2)
  [...]
  (BIT_FIELD_REF @0 @1 @2)))

One attempt to make the types_match true was to add a single_use flag to
the view_convert expression in the pattern above, preventing it from
being applied.
While this actually fixed the test case and produced the intended
instruction sequence, it caused another test to fail that relies on application
of the pattern with multiple uses of the view_convert expression
(gcc.target/i386/vect-strided-3.c).

Hence, the RFC: How can we make the types_match work with view_convert
expressions in the arguments?

---
 gcc/match.pd                             |  7 ++++---
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
 2 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c

Comments

Richard Biener Nov. 7, 2024, 12:47 p.m. UTC | #1
On Tue, 5 Nov 2024, Jennifer Schmitz wrote:

> We are working on a patch to improve the codegen for the following test case:
> uint64x2_t foo (uint64x2_t r) {
>     uint32x4_t a = vreinterpretq_u32_u64 (r);
>     uint32_t t;
>     t = a[0]; a[0] = a[1]; a[1] = t;
>     t = a[2]; a[2] = a[3]; a[3] = t;
>     return vreinterpretq_u64_u32 (a);
> }
> that GCC currently compiles to (-O1):
> foo:
>         mov     v31.16b, v0.16b
>         ins     v0.s[0], v0.s[1]
>         ins     v0.s[1], v31.s[0]
>         ins     v0.s[2], v31.s[3]
>         ins     v0.s[3], v31.s[2]
>         ret
> whereas LLVM produces the preferable sequence
> foo:
> 	rev64   v0.4s, v0.4s
>         ret
> 
> On gimple level, we currently have:
>   _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
>   t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
>   a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
>   a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
>   t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
>   _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
>   a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
>   a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
>   _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
>   return _10;
> 
> whereas the desired sequence is:
>   _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
>   a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
>   _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
>   return _4;
> 
> If we remove the casts from the test case, the forwprop1 dump shows that
> a series of match.pd is applied (repeatedly, only showing the first
> iteration here):
> Applying pattern match.pd:10881, gimple-match-1.cc:25213
> Applying pattern match.pd:11099, gimple-match-1.cc:25714
> Applying pattern match.pd:9549, gimple-match-1.cc:24274
> gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;
> 
> The reason why these patterns cannot be applied with casts seems to be
> the failing types_match (@0, @1) in the following pattern:
> /* Simplify vector inserts of other vector extracts to a permute.  */
> (simplify
>  (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
>  (if (VECTOR_TYPE_P (type)
>       && (VECTOR_MODE_P (TYPE_MODE (type))
>           || optimize_vectors_before_lowering_p ())
>       && types_match (@0, @1)
>       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>       && multiple_p (wi::to_poly_offset (@rpos),
>                      wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
>   (with
>    {
>      [...]
>    }
>    (if (!VECTOR_MODE_P (TYPE_MODE (type))
>         || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
>     (vec_perm @0 @1 { vec_perm_indices_to_tree
>                         (build_vector_type (ssizetype, nunits), sel); })))))
> 
> The types_match fails, because the following pattern has already removed the
> view_convert expression, thereby changing the type of @0:
> (simplify
>  (BIT_FIELD_REF (view_convert @0) @1 @2)
>   [...]
>   (BIT_FIELD_REF @0 @1 @2)))
> 
> One attempt to make the types_match true was to add a single_use flag to
> the view_convert expression in the pattern above, preventing it from
> being applied.
> While this actually fixed the test case and produced the intended
> instruction sequence, it caused another test to fail that relies on application
> of the pattern with multiple use of the view_convert expression
> (gcc.target/i386/vect-strided-3.c).
> 
> Hence, the RFC: How can we make the types_match work with view_convert
> expressions in the arguments?

You could remove the types_match (@0, @1) with

diff --git a/gcc/match.pd b/gcc/match.pd
index 00988241348..820a589b577 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9539,7 +9539,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (VECTOR_TYPE_P (type)
       && (VECTOR_MODE_P (TYPE_MODE (type))
          || optimize_vectors_before_lowering_p ())
-      && types_match (@0, @1)
+      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
+                         TYPE_SIZE (TREE_TYPE (@1)), 0)
       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
       && multiple_p (wi::to_poly_offset (@rpos),
@@ -9547,7 +9548,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (with
    {
      unsigned HOST_WIDE_INT elsz
-       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
+       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@@ -9559,7 +9560,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
    }
    (if (!VECTOR_MODE_P (TYPE_MODE (type))
        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
false))
-    (vec_perm @0 @1 { vec_perm_indices_to_tree
+    (vec_perm @0 (view_convert @1) { vec_perm_indices_to_tree
                         (build_vector_type (ssizetype, nunits), sel); 
})))))
 
 (if (canonicalize_math_after_vectorization_p ())

or alternatively avoid the BIT_FIELD_REF (view_convert @) transform
iff the original ref type-wise matches a vector element extract
and the result with the view_convert does not.

Richard.

> 
> ---
>  gcc/match.pd                             |  7 ++++---
>  gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
>  2 files changed, 21 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 9107e6a95ca..d7957177027 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9357,9 +9357,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
>  
>  (simplify
> - (BIT_FIELD_REF (view_convert @0) @1 @2)
> - (if (! INTEGRAL_TYPE_P (TREE_TYPE (@0))
> -      || type_has_mode_precision_p (TREE_TYPE (@0)))
> + (BIT_FIELD_REF (view_convert@3 @0) @1 @2)
> + (if ((! INTEGRAL_TYPE_P (TREE_TYPE (@0))
> +       || type_has_mode_precision_p (TREE_TYPE (@0)))
> +      && single_use (@3))
>    (BIT_FIELD_REF @0 @1 @2)))
>  
>  (simplify
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> new file mode 100644
> index 00000000000..0fea32919dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> @@ -0,0 +1,17 @@
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +**	rev64	v0\.4s, v0\.4s
> +**	ret
> +*/
> +uint64x2_t foo (uint64x2_t r) {
> +    uint32x4_t a = vreinterpretq_u32_u64 (r);
> +    uint32_t t;
> +    t = a[0]; a[0] = a[1]; a[1] = t;
> +    t = a[2]; a[2] = a[3]; a[3] = t;
> +    return vreinterpretq_u64_u32 (a);
> +}
>
Jennifer Schmitz Nov. 15, 2024, 8:19 a.m. UTC | #2
> On 7 Nov 2024, at 13:47, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Tue, 5 Nov 2024, Jennifer Schmitz wrote:
> 
>> We are working on a patch to improve the codegen for the following test case:
>> uint64x2_t foo (uint64x2_t r) {
>>    uint32x4_t a = vreinterpretq_u32_u64 (r);
>>    uint32_t t;
>>    t = a[0]; a[0] = a[1]; a[1] = t;
>>    t = a[2]; a[2] = a[3]; a[3] = t;
>>    return vreinterpretq_u64_u32 (a);
>> }
>> that GCC currently compiles to (-O1):
>> foo:
>>        mov     v31.16b, v0.16b
>>        ins     v0.s[0], v0.s[1]
>>        ins     v0.s[1], v31.s[0]
>>        ins     v0.s[2], v31.s[3]
>>        ins     v0.s[3], v31.s[2]
>>        ret
>> whereas LLVM produces the preferable sequence
>> foo:
>>      rev64   v0.4s, v0.4s
>>        ret
>> 
>> On gimple level, we currently have:
>>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
>>  t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
>>  a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
>>  a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
>>  t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
>>  _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
>>  a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
>>  a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
>>  _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
>>  return _10;
>> 
>> whereas the desired sequence is:
>>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
>>  a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
>>  _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
>>  return _4;
>> 
>> If we remove the casts from the test case, the forwprop1 dump shows that
>> a series of match.pd is applied (repeatedly, only showing the first
>> iteration here):
>> Applying pattern match.pd:10881, gimple-match-1.cc:25213
>> Applying pattern match.pd:11099, gimple-match-1.cc:25714
>> Applying pattern match.pd:9549, gimple-match-1.cc:24274
>> gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;
>> 
>> The reason why these patterns cannot be applied with casts seems to be
>> the failing types_match (@0, @1) in the following pattern:
>> /* Simplify vector inserts of other vector extracts to a permute.  */
>> (simplify
>> (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
>> (if (VECTOR_TYPE_P (type)
>>      && (VECTOR_MODE_P (TYPE_MODE (type))
>>          || optimize_vectors_before_lowering_p ())
>>      && types_match (@0, @1)
>>      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>>      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>>      && multiple_p (wi::to_poly_offset (@rpos),
>>                     wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
>>  (with
>>   {
>>     [...]
>>   }
>>   (if (!VECTOR_MODE_P (TYPE_MODE (type))
>>        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
>>    (vec_perm @0 @1 { vec_perm_indices_to_tree
>>                        (build_vector_type (ssizetype, nunits), sel); })))))
>> 
>> The types_match fails, because the following pattern has already removed the
>> view_convert expression, thereby changing the type of @0:
>> (simplify
>> (BIT_FIELD_REF (view_convert @0) @1 @2)
>>  [...]
>>  (BIT_FIELD_REF @0 @1 @2)))
>> 
>> One attempt to make the types_match true was to add a single_use flag to
>> the view_convert expression in the pattern above, preventing it from
>> being applied.
>> While this actually fixed the test case and produced the intended
>> instruction sequence, it caused another test to fail that relies on application
>> of the pattern with multiple use of the view_convert expression
>> (gcc.target/i386/vect-strided-3.c).
>> 
>> Hence, the RFC: How can we make the types_match work with view_convert
>> expressions in the arguments?
> 
> You could remove the types_match (@0, @1) with
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 00988241348..820a589b577 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9539,7 +9539,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (if (VECTOR_TYPE_P (type)
>       && (VECTOR_MODE_P (TYPE_MODE (type))
>          || optimize_vectors_before_lowering_p ())
> -      && types_match (@0, @1)
> +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> +                         TYPE_SIZE (TREE_TYPE (@1)), 0)
>       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>       && multiple_p (wi::to_poly_offset (@rpos),
> @@ -9547,7 +9548,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (with
>    {
>      unsigned HOST_WIDE_INT elsz
> -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
>      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
>      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
>      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> @@ -9559,7 +9560,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    }
>    (if (!VECTOR_MODE_P (TYPE_MODE (type))
>        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel,
> false))
> -    (vec_perm @0 @1 { vec_perm_indices_to_tree
> +    (vec_perm @0 (view_convert @1) { vec_perm_indices_to_tree
>                         (build_vector_type (ssizetype, nunits), sel);
> })))))
> 
> (if (canonicalize_math_after_vectorization_p ())
> 
> or alternatively avoid the BIT_FIELD_REF (view_convert @) transform
> iff the original ref type-wise matches a vector element extract
> and the result with the view_convert does not.

Dear Richard,
thank you for the helpful feedback. I made the changes as suggested and added you as co-author.
Best,
Jennifer

This patch improves the codegen for the following test case:
uint64x2_t foo (uint64x2_t r) {
    uint32x4_t a = vreinterpretq_u32_u64 (r);
    uint32_t t;
    t = a[0]; a[0] = a[1]; a[1] = t;
    t = a[2]; a[2] = a[3]; a[3] = t;
    return vreinterpretq_u64_u32 (a);
}
from (-O1):
foo:
        mov     v31.16b, v0.16b
        ins     v0.s[0], v0.s[1]
        ins     v0.s[1], v31.s[0]
        ins     v0.s[2], v31.s[3]
        ins     v0.s[3], v31.s[2]
        ret
to:
foo:
	rev64   v0.4s, v0.4s
        ret

This is achieved by extending the following match.pd pattern to account
for type differences between @0 and @1 due to view converts.
/* Simplify vector inserts of other vector extracts to a permute.  */
(simplify
 (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
Co-authored-by: Richard Biener <rguenther@suse.de>

gcc/
	PR tree-optimization/117093
	* match.pd: Extend
	(bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
	type differences between @0 and @1 due to view converts.

gcc/testsuite/
	PR tree-optimization/117093
	* gcc.dg/tree-ssa/pr117093.c: New test.
---
 gcc/match.pd                             | 13 ++++++++-----
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
 2 files changed, 25 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 9107e6a95ca..af6205cd9a1 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9526,7 +9526,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (VECTOR_TYPE_P (type)
       && (VECTOR_MODE_P (TYPE_MODE (type))
 	  || optimize_vectors_before_lowering_p ())
-      && types_match (@0, @1)
+      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
+			  TYPE_SIZE (TREE_TYPE (@1)), 0)
       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
       && multiple_p (wi::to_poly_offset (@rpos),
@@ -9534,7 +9535,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (with
    {
      unsigned HOST_WIDE_INT elsz
-       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
+       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@@ -9545,9 +9546,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      vec_perm_indices sel (builder, 2, nunits);
    }
    (if (!VECTOR_MODE_P (TYPE_MODE (type))
-	|| can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
-    (vec_perm @0 @1 { vec_perm_indices_to_tree
-                        (build_vector_type (ssizetype, nunits), sel); })))))
+	|| can_vec_perm_const_p (TYPE_MODE (type),
+				 TYPE_MODE (type), sel, false))
+    (vec_perm @0 (view_convert @1)
+     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
+				 sel); })))))
 
 (if (canonicalize_math_after_vectorization_p ())
  (for fmas (FMA)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index 00000000000..0fea32919dd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**	rev64	v0\.4s, v0\.4s
+**	ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+    uint32x4_t a = vreinterpretq_u32_u64 (r);
+    uint32_t t;
+    t = a[0]; a[0] = a[1]; a[1] = t;
+    t = a[2]; a[2] = a[3]; a[3] = t;
+    return vreinterpretq_u64_u32 (a);
+}
Richard Biener Nov. 15, 2024, 11:05 a.m. UTC | #3
On Fri, 15 Nov 2024, Jennifer Schmitz wrote:

> 
> 
> > On 7 Nov 2024, at 13:47, Richard Biener <rguenther@suse.de> wrote:
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Tue, 5 Nov 2024, Jennifer Schmitz wrote:
> > 
> >> We are working on a patch to improve the codegen for the following test case:
> >> uint64x2_t foo (uint64x2_t r) {
> >>    uint32x4_t a = vreinterpretq_u32_u64 (r);
> >>    uint32_t t;
> >>    t = a[0]; a[0] = a[1]; a[1] = t;
> >>    t = a[2]; a[2] = a[3]; a[3] = t;
> >>    return vreinterpretq_u64_u32 (a);
> >> }
> >> that GCC currently compiles to (-O1):
> >> foo:
> >>        mov     v31.16b, v0.16b
> >>        ins     v0.s[0], v0.s[1]
> >>        ins     v0.s[1], v31.s[0]
> >>        ins     v0.s[2], v31.s[3]
> >>        ins     v0.s[3], v31.s[2]
> >>        ret
> >> whereas LLVM produces the preferable sequence
> >> foo:
> >>      rev64   v0.4s, v0.4s
> >>        ret
> >> 
> >> On gimple level, we currently have:
> >>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
> >>  t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
> >>  a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
> >>  a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
> >>  t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
> >>  _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
> >>  a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
> >>  a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
> >>  _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
> >>  return _10;
> >> 
> >> whereas the desired sequence is:
> >>  _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
> >>  a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
> >>  _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
> >>  return _4;
> >> 
> >> If we remove the casts from the test case, the forwprop1 dump shows that
> >> a series of match.pd is applied (repeatedly, only showing the first
> >> iteration here):
> >> Applying pattern match.pd:10881, gimple-match-1.cc:25213
> >> Applying pattern match.pd:11099, gimple-match-1.cc:25714
> >> Applying pattern match.pd:9549, gimple-match-1.cc:24274
> >> gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;
> >> 
> >> The reason why these patterns cannot be applied with casts seems to be
> >> the failing types_match (@0, @1) in the following pattern:
> >> /* Simplify vector inserts of other vector extracts to a permute.  */
> >> (simplify
> >> (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> >> (if (VECTOR_TYPE_P (type)
> >>      && (VECTOR_MODE_P (TYPE_MODE (type))
> >>          || optimize_vectors_before_lowering_p ())
> >>      && types_match (@0, @1)
> >>      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> >>      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
> >>      && multiple_p (wi::to_poly_offset (@rpos),
> >>                     wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
> >>  (with
> >>   {
> >>     [...]
> >>   }
> >>   (if (!VECTOR_MODE_P (TYPE_MODE (type))
> >>        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
> >>    (vec_perm @0 @1 { vec_perm_indices_to_tree
> >>                        (build_vector_type (ssizetype, nunits), sel); })))))
> >> 
> >> The types_match fails, because the following pattern has already removed the
> >> view_convert expression, thereby changing the type of @0:
> >> (simplify
> >> (BIT_FIELD_REF (view_convert @0) @1 @2)
> >>  [...]
> >>  (BIT_FIELD_REF @0 @1 @2)))
> >> 
> >> One attempt to make the types_match true was to add a single_use flag to
> >> the view_convert expression in the pattern above, preventing it from
> >> being applied.
> >> While this actually fixed the test case and produced the intended
> >> instruction sequence, it caused another test to fail that relies on application
> >> of the pattern with multiple use of the view_convert expression
> >> (gcc.target/i386/vect-strided-3.c).
> >> 
> >> Hence, the RFC: How can we make the types_match work with view_convert
> >> expressions in the arguments?
> > 
> > You could remove the types_match (@0, @1) with
> > 
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 00988241348..820a589b577 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -9539,7 +9539,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  (if (VECTOR_TYPE_P (type)
> >       && (VECTOR_MODE_P (TYPE_MODE (type))
> >          || optimize_vectors_before_lowering_p ())
> > -      && types_match (@0, @1)
> > +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> > +                         TYPE_SIZE (TREE_TYPE (@1)), 0)
> >       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
> >       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
> >       && multiple_p (wi::to_poly_offset (@rpos),
> > @@ -9547,7 +9548,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (with
> >    {
> >      unsigned HOST_WIDE_INT elsz
> > -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> > +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
> >      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
> >      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
> >      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> > @@ -9559,7 +9560,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >    }
> >    (if (!VECTOR_MODE_P (TYPE_MODE (type))
> >        || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel,
> > false))
> > -    (vec_perm @0 @1 { vec_perm_indices_to_tree
> > +    (vec_perm @0 (view_convert @1) { vec_perm_indices_to_tree
> >                         (build_vector_type (ssizetype, nunits), sel);
> > })))))
> > 
> > (if (canonicalize_math_after_vectorization_p ())
> > 
> > or alternatively avoid the BIT_FIELD_REF (view_convert @) transform
> > iff the original ref type-wise matches a vector element extract
> > and the result with the view_convert does not.
> 
> Dear Richard,
> thank you for the helpful feedback. I made the changes as suggested and added you as co-author.
> Best,
> Jennifer
> 
> This patch improves the codegen for the following test case:
> uint64x2_t foo (uint64x2_t r) {
>     uint32x4_t a = vreinterpretq_u32_u64 (r);
>     uint32_t t;
>     t = a[0]; a[0] = a[1]; a[1] = t;
>     t = a[2]; a[2] = a[3]; a[3] = t;
>     return vreinterpretq_u64_u32 (a);
> }
> from (-O1):
> foo:
>         mov     v31.16b, v0.16b
>         ins     v0.s[0], v0.s[1]
>         ins     v0.s[1], v31.s[0]
>         ins     v0.s[2], v31.s[3]
>         ins     v0.s[3], v31.s[2]
>         ret
> to:
> foo:
> 	rev64   v0.4s, v0.4s
>         ret
> 
> This is achieved by extending the following match.pd pattern to account
> for type differences between @0 and @1 due to view converts.
> /* Simplify vector inserts of other vector extracts to a permute.  */
> (simplify
>  (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu and
> x86_64-linux-gnu, no regression.
> OK for mainline?

OK.

Thanks,
Richard.

> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> Co-authored-by: Richard Biener <rguenther@suse.de>
> 
> gcc/
> 	PR tree-optimization/117093
> 	* match.pd: Extend
> 	(bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
> 	type differences between @0 and @1 due to view converts.
> 
> gcc/testsuite/
> 	PR tree-optimization/117093
> 	* gcc.dg/tree-ssa/pr117093.c: New test.
> ---
>  gcc/match.pd                             | 13 ++++++++-----
>  gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
>  2 files changed, 25 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 9107e6a95ca..af6205cd9a1 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9526,7 +9526,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (VECTOR_TYPE_P (type)
>        && (VECTOR_MODE_P (TYPE_MODE (type))
>  	  || optimize_vectors_before_lowering_p ())
> -      && types_match (@0, @1)
> +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> +			  TYPE_SIZE (TREE_TYPE (@1)), 0)
>        && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>        && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>        && multiple_p (wi::to_poly_offset (@rpos),
> @@ -9534,7 +9535,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>    (with
>     {
>       unsigned HOST_WIDE_INT elsz
> -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
>       poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
>       poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
>       unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
> @@ -9545,9 +9546,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>       vec_perm_indices sel (builder, 2, nunits);
>     }
>     (if (!VECTOR_MODE_P (TYPE_MODE (type))
> -	|| can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
> -    (vec_perm @0 @1 { vec_perm_indices_to_tree
> -                        (build_vector_type (ssizetype, nunits), sel); })))))
> +	|| can_vec_perm_const_p (TYPE_MODE (type),
> +				 TYPE_MODE (type), sel, false))
> +    (vec_perm @0 (view_convert @1)
> +     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
> +				 sel); })))))
>  
>  (if (canonicalize_math_after_vectorization_p ())
>   (for fmas (FMA)
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> new file mode 100644
> index 00000000000..0fea32919dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
> @@ -0,0 +1,17 @@
> +/* { dg-final { check-function-bodies "**" "" } } */
> +/* { dg-options "-O1" } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** foo:
> +**	rev64	v0\.4s, v0\.4s
> +**	ret
> +*/
> +uint64x2_t foo (uint64x2_t r) {
> +    uint32x4_t a = vreinterpretq_u32_u64 (r);
> +    uint32_t t;
> +    t = a[0]; a[0] = a[1]; a[1] = t;
> +    t = a[2]; a[2] = a[3]; a[3] = t;
> +    return vreinterpretq_u64_u32 (a);
> +}
>
Jennifer Schmitz Nov. 15, 2024, 6:01 p.m. UTC | #4
> On 15 Nov 2024, at 12:05, Richard Biener <rguenther@suse.de> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Fri, 15 Nov 2024, Jennifer Schmitz wrote:
> 
>> 
>> 
>>> On 7 Nov 2024, at 13:47, Richard Biener <rguenther@suse.de> wrote:
>>> 
>>> External email: Use caution opening links or attachments
>>> 
>>> 
>>> On Tue, 5 Nov 2024, Jennifer Schmitz wrote:
>>> 
>>>> We are working on a patch to improve the codegen for the following test case:
>>>> uint64x2_t foo (uint64x2_t r) {
>>>>   uint32x4_t a = vreinterpretq_u32_u64 (r);
>>>>   uint32_t t;
>>>>   t = a[0]; a[0] = a[1]; a[1] = t;
>>>>   t = a[2]; a[2] = a[3]; a[3] = t;
>>>>   return vreinterpretq_u64_u32 (a);
>>>> }
>>>> that GCC currently compiles to (-O1):
>>>> foo:
>>>>       mov     v31.16b, v0.16b
>>>>       ins     v0.s[0], v0.s[1]
>>>>       ins     v0.s[1], v31.s[0]
>>>>       ins     v0.s[2], v31.s[3]
>>>>       ins     v0.s[3], v31.s[2]
>>>>       ret
>>>> whereas LLVM produces the preferable sequence
>>>> foo:
>>>>     rev64   v0.4s, v0.4s
>>>>       ret
>>>> 
>>>> On gimple level, we currently have:
>>>> _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_3(D));
>>>> t_4 = BIT_FIELD_REF <r_3(D), 32, 0>;
>>>> a_5 = VEC_PERM_EXPR <_1, _1, { 1, 1, 2, 3 }>;
>>>> a_6 = BIT_INSERT_EXPR <a_5, t_4, 32 (32 bits)>;
>>>> t_7 = BIT_FIELD_REF <r_3(D), 32, 64>;
>>>> _2 = BIT_FIELD_REF <r_3(D), 32, 96>;
>>>> a_8 = BIT_INSERT_EXPR <a_6, _2, 64 (32 bits)>;
>>>> a_9 = BIT_INSERT_EXPR <a_8, t_7, 96 (32 bits)>;
>>>> _10 = VIEW_CONVERT_EXPR<uint64x2_t>(a_9);
>>>> return _10;
>>>> 
>>>> whereas the desired sequence is:
>>>> _1 = VIEW_CONVERT_EXPR<uint32x4_t>(r_2(D));
>>>> a_3 = VEC_PERM_EXPR <_1, _1, { 1, 0, 3, 2 }>;
>>>> _4 = VIEW_CONVERT_EXPR<uint64x2_t>(a_3);
>>>> return _4;
>>>> 
>>>> If we remove the casts from the test case, the forwprop1 dump shows that
>>>> a series of match.pd is applied (repeatedly, only showing the first
>>>> iteration here):
>>>> Applying pattern match.pd:10881, gimple-match-1.cc:25213
>>>> Applying pattern match.pd:11099, gimple-match-1.cc:25714
>>>> Applying pattern match.pd:9549, gimple-match-1.cc:24274
>>>> gimple_simplified to a_7 = VEC_PERM_EXPR <r_3(D), r_3(D), { 1, 0, 2, 3 }>;
>>>> 
>>>> The reason why these patterns cannot be applied with casts seems to be
>>>> the failing types_match (@0, @1) in the following pattern:
>>>> /* Simplify vector inserts of other vector extracts to a permute.  */
>>>> (simplify
>>>> (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
>>>> (if (VECTOR_TYPE_P (type)
>>>>     && (VECTOR_MODE_P (TYPE_MODE (type))
>>>>         || optimize_vectors_before_lowering_p ())
>>>>     && types_match (@0, @1)
>>>>     && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>>>>     && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>>>>     && multiple_p (wi::to_poly_offset (@rpos),
>>>>                    wi::to_poly_offset (TYPE_SIZE (TREE_TYPE (type)))))
>>>> (with
>>>>  {
>>>>    [...]
>>>>  }
>>>>  (if (!VECTOR_MODE_P (TYPE_MODE (type))
>>>>       || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
>>>>   (vec_perm @0 @1 { vec_perm_indices_to_tree
>>>>                       (build_vector_type (ssizetype, nunits), sel); })))))
>>>> 
>>>> The types_match fails, because the following pattern has already removed the
>>>> view_convert expression, thereby changing the type of @0:
>>>> (simplify
>>>> (BIT_FIELD_REF (view_convert @0) @1 @2)
>>>> [...]
>>>> (BIT_FIELD_REF @0 @1 @2)))
>>>> 
>>>> One attempt to make the types_match true was to add a single_use flag to
>>>> the view_convert expression in the pattern above, preventing it from
>>>> being applied.
>>>> While this actually fixed the test case and produced the intended
>>>> instruction sequence, it caused another test to fail that relies on application
>>>> of the pattern with multiple use of the view_convert expression
>>>> (gcc.target/i386/vect-strided-3.c).
>>>> 
>>>> Hence, the RFC: How can we make the types_match work with view_convert
>>>> expressions in the arguments?
>>> 
>>> You could remove the types_match (@0, @1) with
>>> 
>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>> index 00988241348..820a589b577 100644
>>> --- a/gcc/match.pd
>>> +++ b/gcc/match.pd
>>> @@ -9539,7 +9539,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>> (if (VECTOR_TYPE_P (type)
>>>      && (VECTOR_MODE_P (TYPE_MODE (type))
>>>         || optimize_vectors_before_lowering_p ())
>>> -      && types_match (@0, @1)
>>> +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
>>> +                         TYPE_SIZE (TREE_TYPE (@1)), 0)
>>>      && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>>>      && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>>>      && multiple_p (wi::to_poly_offset (@rpos),
>>> @@ -9547,7 +9548,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>  (with
>>>   {
>>>     unsigned HOST_WIDE_INT elsz
>>> -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
>>> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
>>>     poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
>>>     poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
>>>     unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
>>> @@ -9559,7 +9560,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>   }
>>>   (if (!VECTOR_MODE_P (TYPE_MODE (type))
>>>       || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel,
>>> false))
>>> -    (vec_perm @0 @1 { vec_perm_indices_to_tree
>>> +    (vec_perm @0 (view_convert @1) { vec_perm_indices_to_tree
>>>                        (build_vector_type (ssizetype, nunits), sel);
>>> })))))
>>> 
>>> (if (canonicalize_math_after_vectorization_p ())
>>> 
>>> or alternatively avoid the BIT_FIELD_REF (view_convert @) transform
>>> iff the original ref type-wise matches a vector element extract
>>> and the result with the view_convert does not.
>> 
>> Dear Richard,
>> thank you for the helpful feedback. I made the changes as suggested and added you as co-author.
>> Best,
>> Jennifer
>> 
>> This patch improves the codegen for the following test case:
>> uint64x2_t foo (uint64x2_t r) {
>>    uint32x4_t a = vreinterpretq_u32_u64 (r);
>>    uint32_t t;
>>    t = a[0]; a[0] = a[1]; a[1] = t;
>>    t = a[2]; a[2] = a[3]; a[3] = t;
>>    return vreinterpretq_u64_u32 (a);
>> }
>> from (-O1):
>> foo:
>>        mov     v31.16b, v0.16b
>>        ins     v0.s[0], v0.s[1]
>>        ins     v0.s[1], v31.s[0]
>>        ins     v0.s[2], v31.s[3]
>>        ins     v0.s[3], v31.s[2]
>>        ret
>> to:
>> foo:
>>      rev64   v0.4s, v0.4s
>>        ret
>> 
>> This is achieved by extending the following match.pd pattern to account
>> for type differences between @0 and @1 due to view converts.
>> /* Simplify vector inserts of other vector extracts to a permute.  */
>> (simplify
>> (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)
>> 
>> The patch was bootstrapped and regtested on aarch64-linux-gnu and
>> x86_64-linux-gnu, no regression.
>> OK for mainline?
> 
> OK.
Thanks, committed with c83e2d47574fd9a21f257e0f0d7e350c3f1b0618.
Regards,
Jennifer
> 
> Thanks,
> Richard.
> 
>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> Co-authored-by: Richard Biener <rguenther@suse.de>
>> 
>> gcc/
>>      PR tree-optimization/117093
>>      * match.pd: Extend
>>      (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
>>      type differences between @0 and @1 due to view converts.
>> 
>> gcc/testsuite/
>>      PR tree-optimization/117093
>>      * gcc.dg/tree-ssa/pr117093.c: New test.
>> ---
>> gcc/match.pd                             | 13 ++++++++-----
>> gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +++++++++++++++++
>> 2 files changed, 25 insertions(+), 5 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
>> 
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 9107e6a95ca..af6205cd9a1 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -9526,7 +9526,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>  (if (VECTOR_TYPE_P (type)
>>       && (VECTOR_MODE_P (TYPE_MODE (type))
>>        || optimize_vectors_before_lowering_p ())
>> -      && types_match (@0, @1)
>> +      && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
>> +                       TYPE_SIZE (TREE_TYPE (@1)), 0)
>>       && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
>>       && TYPE_VECTOR_SUBPARTS (type).is_constant ()
>>       && multiple_p (wi::to_poly_offset (@rpos),
>> @@ -9534,7 +9535,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>   (with
>>    {
>>      unsigned HOST_WIDE_INT elsz
>> -       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1))));
>> +       = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0))));
>>      poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
>>      poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
>>      unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
>> @@ -9545,9 +9546,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>      vec_perm_indices sel (builder, 2, nunits);
>>    }
>>    (if (!VECTOR_MODE_P (TYPE_MODE (type))
>> -     || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, false))
>> -    (vec_perm @0 @1 { vec_perm_indices_to_tree
>> -                        (build_vector_type (ssizetype, nunits), sel); })))))
>> +     || can_vec_perm_const_p (TYPE_MODE (type),
>> +                              TYPE_MODE (type), sel, false))
>> +    (vec_perm @0 (view_convert @1)
>> +     { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
>> +                              sel); })))))
>> 
>> (if (canonicalize_math_after_vectorization_p ())
>>  (for fmas (FMA)
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
>> new file mode 100644
>> index 00000000000..0fea32919dd
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
>> @@ -0,0 +1,17 @@
>> +/* { dg-final { check-function-bodies "**" "" } } */
>> +/* { dg-options "-O1" } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** foo:
>> +**   rev64   v0\.4s, v0\.4s
>> +**   ret
>> +*/
>> +uint64x2_t foo (uint64x2_t r) {
>> +    uint32x4_t a = vreinterpretq_u32_u64 (r);
>> +    uint32_t t;
>> +    t = a[0]; a[0] = a[1]; a[1] = t;
>> +    t = a[2]; a[2] = a[3]; a[3] = t;
>> +    return vreinterpretq_u64_u32 (a);
>> +}
>> 
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 9107e6a95ca..d7957177027 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9357,9 +9357,10 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (BIT_FIELD_REF @0 @3 { const_binop (PLUS_EXPR, bitsizetype, @2, @4); }))
 
 (simplify
- (BIT_FIELD_REF (view_convert @0) @1 @2)
- (if (! INTEGRAL_TYPE_P (TREE_TYPE (@0))
-      || type_has_mode_precision_p (TREE_TYPE (@0)))
+ (BIT_FIELD_REF (view_convert@3 @0) @1 @2)
+ (if ((! INTEGRAL_TYPE_P (TREE_TYPE (@0))
+       || type_has_mode_precision_p (TREE_TYPE (@0)))
+      && single_use (@3))
   (BIT_FIELD_REF @0 @1 @2)))
 
 (simplify
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index 00000000000..0fea32919dd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@ 
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**	rev64	v0\.4s, v0\.4s
+**	ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+    uint32x4_t a = vreinterpretq_u32_u64 (r);
+    uint32_t t;
+    t = a[0]; a[0] = a[1]; a[1] = t;
+    t = a[2]; a[2] = a[3]; a[3] = t;
+    return vreinterpretq_u64_u32 (a);
+}