diff mbox series

[v1,1/4] Match: Support form 1 for vector signed integer SAT_SUB

Message ID 20241011062245.2486653-1-pan2.li@intel.com
State New
Headers show
Series [v1,1/4] Match: Support form 1 for vector signed integer SAT_SUB | expand

Commit Message

Li, Pan2 Oct. 11, 2024, 6:22 a.m. UTC
From: Pan Li <pan2.li@intel.com>

This patch would like to support the form 1 of the vector signed
integer SAT_SUB.  Aka below example:

Form 1:
  #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX)                     \
  void __attribute__((noinline))                                       \
  vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
  {                                                                    \
    unsigned i;                                                        \
    for (i = 0; i < limit; i++)                                        \
      {                                                                \
        T x = op_1[i];                                                 \
        T y = op_2[i];                                                 \
        T minus = (UT)x - (UT)y;                                       \
        out[i] = (x ^ y) >= 0                                          \
          ? minus                                                      \
          : (minus ^ x) >= 0                                           \
            ? minus                                                    \
            : x < 0 ? MIN : MAX;                                       \
      }                                                                \
  }

DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)

Before this patch:
  91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
  92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... }, _108, 0);
  93   │   _69 = vect_x_16.11_80 >> 7;
  94   │   vect_x.12_81 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_x_16.11_80);
  95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... }, _108, 0);
  96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
  97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
  98   │   vect_y.16_86 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_y_18.15_85);
  99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
 100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR<vector([16,16]) signed char>(vect__6.17_87);
 101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
 102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
 103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
 104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, vect_minus_19.18_88);
 105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
 106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
 107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
 108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
 109   │   ivtmp_107 = ivtmp_106 - _108;

After this patch:
  81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
  82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... }, _102, 0);
  83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... }, _102, 0);
  84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
  85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, vect_patt_38.15_94);
  86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
  87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
  88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
  89   │   ivtmp_101 = ivtmp_100 - _102;

The below test suites are passed for this patch.
* The rv64gcv fully regression test.
* The x86 bootstrap test.
* The x86 fully regression test.

gcc/ChangeLog:

	* match.pd: Add case 1 matching pattern for vector signed SAT_SUB.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/match.pd | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

Comments

Richard Biener Oct. 11, 2024, 9:09 a.m. UTC | #1
On Fri, Oct 11, 2024 at 8:24 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch would like to support the form 1 of the vector signed
> integer SAT_SUB.  Aka below example:
>
> Form 1:
>   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX)                     \
>   void __attribute__((noinline))                                       \
>   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
>   {                                                                    \
>     unsigned i;                                                        \
>     for (i = 0; i < limit; i++)                                        \
>       {                                                                \
>         T x = op_1[i];                                                 \
>         T y = op_2[i];                                                 \
>         T minus = (UT)x - (UT)y;                                       \
>         out[i] = (x ^ y) >= 0                                          \
>           ? minus                                                      \
>           : (minus ^ x) >= 0                                           \
>             ? minus                                                    \
>             : x < 0 ? MIN : MAX;                                       \
>       }                                                                \
>   }
>
> DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
>   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... }, _108, 0);
>   93   │   _69 = vect_x_16.11_80 >> 7;
>   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_x_16.11_80);
>   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... }, _108, 0);
>   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
>   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
>   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_y_18.15_85);
>   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
>  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR<vector([16,16]) signed char>(vect__6.17_87);
>  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
>  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
>  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
>  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, vect_minus_19.18_88);
>  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
>  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
>  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
>  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
>  109   │   ivtmp_107 = ivtmp_106 - _108;
>
> After this patch:
>   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
>   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... }, _102, 0);
>   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... }, _102, 0);
>   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
>   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, vect_patt_38.15_94);
>   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
>   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
>   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
>   89   │   ivtmp_101 = ivtmp_100 - _102;
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

I wonder since we now can match many different variants of writing
signed and unsigned
saturation add and sub whether it makes sense to canonicalize to the "cheapest"
variant when the target doesn't support .SAT_SUB/ADD?  Are there any
"sub-patterns"
not forming the full saturation add/sub that can be
simplified/canonicalized in such
way maybe?

> gcc/ChangeLog:
>
>         * match.pd: Add case 1 matching pattern for vector signed SAT_SUB.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 8a7569ce387..a3c298d3a22 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3401,6 +3401,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> +/* Signed saturation sub, case 4:
> +   T minus = (T)((UT)X - (UT)Y);
> +   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
> +
> +   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
> +(match (signed_integer_sat_sub @0 @1)
> + (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
> +                                                        (nop_convert @1))))
> +                      integer_zerop)
> +                  (lt (bit_xor:c @0 @1) integer_zerop))
> +       (bit_xor:c (nop_convert (negate (nop_convert (convert
> +                                                     (lt @0 integer_zerop)))))
> +                  max_value)
> +       @2)
> + (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type))))
> +
>  /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
>     SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
>  (match (unsigned_integer_sat_trunc @0)
> --
> 2.43.0
>
Li, Pan2 Oct. 11, 2024, 9:44 a.m. UTC | #2
Thanks Richard for reviewing and comments.

> I wonder since we now can match many different variants of writing
> signed and unsigned
> saturation add and sub whether it makes sense to canonicalize to the "cheapest"
> variant when the target doesn't support .SAT_SUB/ADD?  

I think it is a good point. But sorry, I am not sure I fully get the point here. What is the purpose of
the "cheapest" variant, regardless of whether the target supports it or not? Do you mean that for a "cheapest" variant
we can expand it in the middle end, instead of leaving it to the target?

> Are there any
> "sub-patterns"
> not forming the full saturation add/sub that can be
> simplified/canonicalized in such
> way maybe?

Yes, you are right. There will be some common sub-patterns among the many saturation ALU variants,
like x < 0 ? MIN : MAX. I plan to refine this part after all saturation ALU variants are supported
(to make sure we have the full picture).

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Friday, October 11, 2024 5:10 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; Tamar.Christina@arm.com; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

On Fri, Oct 11, 2024 at 8:24 AM <pan2.li@intel.com> wrote:
>
> From: Pan Li <pan2.li@intel.com>
>
> This patch would like to support the form 1 of the vector signed
> integer SAT_SUB.  Aka below example:
>
> Form 1:
>   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX)                     \
>   void __attribute__((noinline))                                       \
>   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
>   {                                                                    \
>     unsigned i;                                                        \
>     for (i = 0; i < limit; i++)                                        \
>       {                                                                \
>         T x = op_1[i];                                                 \
>         T y = op_2[i];                                                 \
>         T minus = (UT)x - (UT)y;                                       \
>         out[i] = (x ^ y) >= 0                                          \
>           ? minus                                                      \
>           : (minus ^ x) >= 0                                           \
>             ? minus                                                    \
>             : x < 0 ? MIN : MAX;                                       \
>       }                                                                \
>   }
>
> DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
>
> Before this patch:
>   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
>   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... }, _108, 0);
>   93   │   _69 = vect_x_16.11_80 >> 7;
>   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_x_16.11_80);
>   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... }, _108, 0);
>   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
>   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
>   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_y_18.15_85);
>   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
>  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR<vector([16,16]) signed char>(vect__6.17_87);
>  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
>  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
>  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
>  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, vect_minus_19.18_88);
>  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
>  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
>  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
>  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
>  109   │   ivtmp_107 = ivtmp_106 - _108;
>
> After this patch:
>   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
>   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... }, _102, 0);
>   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... }, _102, 0);
>   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
>   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, vect_patt_38.15_94);
>   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
>   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
>   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
>   89   │   ivtmp_101 = ivtmp_100 - _102;
>
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

OK.

I wonder since we now can match many different variants of writing
signed and unsigned
saturation add and sub whether it makes sense to canonicalize to the "cheapest"
variant when the target doesn't support .SAT_SUB/ADD?  Are there any
"sub-patterns"
not forming the full saturation add/sub that can be
simplified/canonicalized in such
way maybe?

> gcc/ChangeLog:
>
>         * match.pd: Add case 1 matching pattern for vector signed SAT_SUB.
>
> Signed-off-by: Pan Li <pan2.li@intel.com>
> ---
>  gcc/match.pd | 16 ++++++++++++++++
>  1 file changed, 16 insertions(+)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 8a7569ce387..a3c298d3a22 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3401,6 +3401,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
>        && types_match (type, @0, @1))))
>
> +/* Signed saturation sub, case 4:
> +   T minus = (T)((UT)X - (UT)Y);
> +   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
> +
> +   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
> +(match (signed_integer_sat_sub @0 @1)
> + (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
> +                                                        (nop_convert @1))))
> +                      integer_zerop)
> +                  (lt (bit_xor:c @0 @1) integer_zerop))
> +       (bit_xor:c (nop_convert (negate (nop_convert (convert
> +                                                     (lt @0 integer_zerop)))))
> +                  max_value)
> +       @2)
> + (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type))))
> +
>  /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
>     SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
>  (match (unsigned_integer_sat_trunc @0)
> --
> 2.43.0
>
Richard Biener Oct. 11, 2024, 10:27 a.m. UTC | #3
On Fri, Oct 11, 2024 at 11:44 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for reviewing and comments.
>
> > I wonder since we now can match many different variants of writing
> > signed and unsigned
> > saturation add and sub whether it makes sense to canonicalize to the "cheapest"
> > variant when the target doesn't support .SAT_SUB/ADD?
>
> I think it is a good point. But sorry, not sure if I get the point here. Like what is the purpose of
> the "cheapest" variant regardless of target support it or not. You mean for a "cheapest" variant
> we can expand it in the middle end? Instead of leave it to the target.

Yes.  The different variants seem to have different complexity and generic
expansion might prefer one or another version.  So I wasn't suggesting to
expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
to the cheapest (smallest) variant.

> > Are there any
> > "sub-patterns"
> > not forming the full saturation add/sub that can be
> > simplified/canonicalized in such
> > way maybe?
>
> Yes, you are right. There will be some common sub-pattern for so many saturation alu variants.
> Like x < 0 ? MIN : MAX. I plan to refine this part after all saturation alu are supported
> (to make sure we have full picture).

Yeah, having a full picture is good.

Richard.

> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, October 11, 2024 5:10 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Tamar.Christina@arm.com; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB
>
> On Fri, Oct 11, 2024 at 8:24 AM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > This patch would like to support the form 1 of the vector signed
> > integer SAT_SUB.  Aka below example:
> >
> > Form 1:
> >   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX)                     \
> >   void __attribute__((noinline))                                       \
> >   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
> >   {                                                                    \
> >     unsigned i;                                                        \
> >     for (i = 0; i < limit; i++)                                        \
> >       {                                                                \
> >         T x = op_1[i];                                                 \
> >         T y = op_2[i];                                                 \
> >         T minus = (UT)x - (UT)y;                                       \
> >         out[i] = (x ^ y) >= 0                                          \
> >           ? minus                                                      \
> >           : (minus ^ x) >= 0                                           \
> >             ? minus                                                    \
> >             : x < 0 ? MIN : MAX;                                       \
> >       }                                                                \
> >   }
> >
> > DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
> >
> > Before this patch:
> >   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
> >   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... }, _108, 0);
> >   93   │   _69 = vect_x_16.11_80 >> 7;
> >   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_x_16.11_80);
> >   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... }, _108, 0);
> >   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
> >   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
> >   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_y_18.15_85);
> >   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
> >  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR<vector([16,16]) signed char>(vect__6.17_87);
> >  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
> >  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
> >  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
> >  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, vect_minus_19.18_88);
> >  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
> >  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
> >  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
> >  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
> >  109   │   ivtmp_107 = ivtmp_106 - _108;
> >
> > After this patch:
> >   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
> >   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... }, _102, 0);
> >   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... }, _102, 0);
> >   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
> >   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, vect_patt_38.15_94);
> >   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
> >   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
> >   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
> >   89   │   ivtmp_101 = ivtmp_100 - _102;
> >
> > The below test suites are passed for this patch.
> > * The rv64gcv fully regression test.
> > * The x86 bootstrap test.
> > * The x86 fully regression test.
>
> OK.
>
> I wonder since we now can match many different variants of writing
> signed and unsigned
> saturation add and sub whether it makes sense to canonicalize to the "cheapest"
> variant when the target doesn't support .SAT_SUB/ADD?  Are there any
> "sub-patterns"
> not forming the full saturation add/sub that can be
> simplified/canonicalized in such
> way maybe?
>
> > gcc/ChangeLog:
> >
> >         * match.pd: Add case 1 matching pattern for vector signed SAT_SUB.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd | 16 ++++++++++++++++
> >  1 file changed, 16 insertions(+)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 8a7569ce387..a3c298d3a22 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3401,6 +3401,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
> >        && types_match (type, @0, @1))))
> >
> > +/* Signed saturation sub, case 4:
> > +   T minus = (T)((UT)X - (UT)Y);
> > +   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
> > +
> > +   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
> > +(match (signed_integer_sat_sub @0 @1)
> > + (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
> > +                                                        (nop_convert @1))))
> > +                      integer_zerop)
> > +                  (lt (bit_xor:c @0 @1) integer_zerop))
> > +       (bit_xor:c (nop_convert (negate (nop_convert (convert
> > +                                                     (lt @0 integer_zerop)))))
> > +                  max_value)
> > +       @2)
> > + (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type))))
> > +
> >  /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
> >     SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
> >  (match (unsigned_integer_sat_trunc @0)
> > --
> > 2.43.0
> >
Li, Pan2 Oct. 11, 2024, 11:33 p.m. UTC | #4
Thanks Richard for explaining.

> Yes.  The different variants seem to have different complexity and generic
> expansion might prefer one or another version.  So I wasn't suggesting to
> expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
> to the cheapest (smallest) variant.

Got it. There are many variants, but we can canonicalize them to the cheapest one.
I will give it a try after all saturation ALU variants are supported.

Pan

-----Original Message-----
From: Richard Biener <richard.guenther@gmail.com> 
Sent: Friday, October 11, 2024 6:27 PM
To: Li, Pan2 <pan2.li@intel.com>
Cc: gcc-patches@gcc.gnu.org; Tamar.Christina@arm.com; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB

On Fri, Oct 11, 2024 at 11:44 AM Li, Pan2 <pan2.li@intel.com> wrote:
>
> Thanks Richard for reviewing and comments.
>
> > I wonder since we now can match many different variants of writing
> > signed and unsigned
> > saturation add and sub whether it makes sense to canonicalize to the "cheapest"
> > variant when the target doesn't support .SAT_SUB/ADD?
>
> I think it is a good point. But sorry, not sure if I get the point here. Like what is the purpose of
> the "cheapest" variant regardless of target support it or not. You mean for a "cheapest" variant
> we can expand it in the middle end? Instead of leave it to the target.

Yes.  The different variants seem to have different complexity and generic
expansion might prefer one or another version.  So I wasn't suggesting to
expand .SAT_ADD/SUB in the middle-end but instead canonicalize the open-coding
to the cheapest (smallest) variant.

> > Are there any
> > "sub-patterns"
> > not forming the full saturation add/sub that can be
> > simplified/canonicalized in such
> > way maybe?
>
> Yes, you are right. There will be some common sub-pattern for so many saturation alu variants.
> Like x < 0 ? MIN : MAX. I plan to refine this part after all saturation alu are supported
> (to make sure we have full picture).

Yeah, having a full picture is good.

Richard.

> Pan
>
> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Friday, October 11, 2024 5:10 PM
> To: Li, Pan2 <pan2.li@intel.com>
> Cc: gcc-patches@gcc.gnu.org; Tamar.Christina@arm.com; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; rdapp.gcc@gmail.com
> Subject: Re: [PATCH v1 1/4] Match: Support form 1 for vector signed integer SAT_SUB
>
> On Fri, Oct 11, 2024 at 8:24 AM <pan2.li@intel.com> wrote:
> >
> > From: Pan Li <pan2.li@intel.com>
> >
> > This patch would like to support the form 1 of the vector signed
> > integer SAT_SUB.  Aka below example:
> >
> > Form 1:
> >   #define DEF_VEC_SAT_S_SUB_FMT_1(T, UT, MIN, MAX)                     \
> >   void __attribute__((noinline))                                       \
> >   vec_sat_s_add_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
> >   {                                                                    \
> >     unsigned i;                                                        \
> >     for (i = 0; i < limit; i++)                                        \
> >       {                                                                \
> >         T x = op_1[i];                                                 \
> >         T y = op_2[i];                                                 \
> >         T minus = (UT)x - (UT)y;                                       \
> >         out[i] = (x ^ y) >= 0                                          \
> >           ? minus                                                      \
> >           : (minus ^ x) >= 0                                           \
> >             ? minus                                                    \
> >             : x < 0 ? MIN : MAX;                                       \
> >       }                                                                \
> >   }
> >
> > DEF_VEC_SAT_S_SUB_FMT_1(int8_t, uint8_t, INT8_MIN, INT8_MAX)
> >
> > Before this patch:
> >   91   │   _108 = .SELECT_VL (ivtmp_106, POLY_INT_CST [16, 16]);
> >   92   │   vect_x_16.11_80 = .MASK_LEN_LOAD (vectp_op_1.9_78, 8B, { -1, ... }, _108, 0);
> >   93   │   _69 = vect_x_16.11_80 >> 7;
> >   94   │   vect_x.12_81 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_x_16.11_80);
> >   95   │   vect_y_18.15_85 = .MASK_LEN_LOAD (vectp_op_2.13_83, 8B, { -1, ... }, _108, 0);
> >   96   │   vect__7.21_91 = vect_x_16.11_80 ^ vect_y_18.15_85;
> >   97   │   mask__44.22_92 = vect__7.21_91 < { 0, ... };
> >   98   │   vect_y.16_86 = VIEW_CONVERT_EXPR<vector([16,16]) unsigned char>(vect_y_18.15_85);
> >   99   │   vect__6.17_87 = vect_x.12_81 - vect_y.16_86;
> >  100   │   vect_minus_19.18_88 = VIEW_CONVERT_EXPR<vector([16,16]) signed char>(vect__6.17_87);
> >  101   │   vect__8.19_89 = vect_x_16.11_80 ^ vect_minus_19.18_88;
> >  102   │   mask__42.20_90 = vect__8.19_89 < { 0, ... };
> >  103   │   mask__41.23_93 = mask__42.20_90 & mask__44.22_92;
> >  104   │   _4 = .COND_XOR (mask__41.23_93, _69, { 127, ... }, vect_minus_19.18_88);
> >  105   │   .MASK_LEN_STORE (vectp_out.31_102, 8B, { -1, ... }, _108, 0, _4);
> >  106   │   vectp_op_1.9_79 = vectp_op_1.9_78 + _108;
> >  107   │   vectp_op_2.13_84 = vectp_op_2.13_83 + _108;
> >  108   │   vectp_out.31_103 = vectp_out.31_102 + _108;
> >  109   │   ivtmp_107 = ivtmp_106 - _108;
> >
> > After this patch:
> >   81   │   _102 = .SELECT_VL (ivtmp_100, POLY_INT_CST [16, 16]);
> >   82   │   vect_x_16.11_89 = .MASK_LEN_LOAD (vectp_op_1.9_87, 8B, { -1, ... }, _102, 0);
> >   83   │   vect_y_18.14_93 = .MASK_LEN_LOAD (vectp_op_2.12_91, 8B, { -1, ... }, _102, 0);
> >   84   │   vect_patt_38.15_94 = .SAT_SUB (vect_x_16.11_89, vect_y_18.14_93);
> >   85   │   .MASK_LEN_STORE (vectp_out.16_96, 8B, { -1, ... }, _102, 0, vect_patt_38.15_94);
> >   86   │   vectp_op_1.9_88 = vectp_op_1.9_87 + _102;
> >   87   │   vectp_op_2.12_92 = vectp_op_2.12_91 + _102;
> >   88   │   vectp_out.16_97 = vectp_out.16_96 + _102;
> >   89   │   ivtmp_101 = ivtmp_100 - _102;
> >
> > The below test suites are passed for this patch.
> > * The rv64gcv fully regression test.
> > * The x86 bootstrap test.
> > * The x86 fully regression test.
>
> OK.
>
> I wonder since we now can match many different variants of writing
> signed and unsigned
> saturation add and sub whether it makes sense to canonicalize to the "cheapest"
> variant when the target doesn't support .SAT_SUB/ADD?  Are there any
> "sub-patterns"
> not forming the full saturation add/sub that can be
> simplified/canonicalized in such
> way maybe?
>
> > gcc/ChangeLog:
> >
> >         * match.pd: Add case 1 matching pattern for vector signed SAT_SUB.
> >
> > Signed-off-by: Pan Li <pan2.li@intel.com>
> > ---
> >  gcc/match.pd | 16 ++++++++++++++++
> >  1 file changed, 16 insertions(+)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index 8a7569ce387..a3c298d3a22 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3401,6 +3401,22 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
> >        && types_match (type, @0, @1))))
> >
> > +/* Signed saturation sub, case 4:
> > +   T minus = (T)((UT)X - (UT)Y);
> > +   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
> > +
> > +   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
> > +(match (signed_integer_sat_sub @0 @1)
> > + (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
> > +                                                        (nop_convert @1))))
> > +                      integer_zerop)
> > +                  (lt (bit_xor:c @0 @1) integer_zerop))
> > +       (bit_xor:c (nop_convert (negate (nop_convert (convert
> > +                                                     (lt @0 integer_zerop)))))
> > +                  max_value)
> > +       @2)
> > + (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type))))
> > +
> >  /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
> >     SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
> >  (match (unsigned_integer_sat_trunc @0)
> > --
> > 2.43.0
> >
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 8a7569ce387..a3c298d3a22 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3401,6 +3401,22 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
       && types_match (type, @0, @1))))
 
+/* Signed saturation sub, case 4:
+   T minus = (T)((UT)X - (UT)Y);
+   SAT_S_SUB = (X ^ Y) < 0 & (X ^ minus) < 0 ? (-(T)(X < 0) ^ MAX) : minus;
+
+   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
+(match (signed_integer_sat_sub @0 @1)
+ (cond^ (bit_and:c (lt (bit_xor @0 (nop_convert@2 (minus (nop_convert @0)
+						         (nop_convert @1))))
+		       integer_zerop)
+		   (lt (bit_xor:c @0 @1) integer_zerop))
+	(bit_xor:c (nop_convert (negate (nop_convert (convert
+						      (lt @0 integer_zerop)))))
+		   max_value)
+	@2)
+ (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type))))
+
 /* Unsigned saturation truncate, case 1, sizeof (WT) > sizeof (NT).
    SAT_U_TRUNC = (NT)x | (NT)(-(X > (WT)(NT)(-1))).  */
 (match (unsigned_integer_sat_trunc @0)