
Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)

Message ID SJ2PR01MB86357ECFCF921CF3114FC894E12FA@SJ2PR01MB8635.prod.exchangelabs.com
State New
Series Vect: use a small step to calculate induction for the unrolled loop (PR tree-optimization/110449)

Commit Message

Hao Liu OS July 5, 2023, 6:43 a.m. UTC
Hi,

If a loop is unrolled n times during vectorization, two steps are used to
calculate the induction variable:
  - The small step for the unrolled i-th copy: vec_1 = vec_iv + (VF/n * Step)
  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)

This patch calculates an extra vec_n to replace vec_loop:
  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.

This way, we can save the large step register and the related operations.
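
For concreteness, here is a small self-contained C sketch of the arithmetic
involved (an illustration only, not code from the patch; it models a single
lane and assumes VF = 8 with unroll factor n = 2, matching the testcase
below):

  #include <stdio.h>

  int
  main (void)
  {
    float step = 1.0f;
    float iv = 0.0f;                     /* value entering the iteration */
    float small_step = 8.0f / 2 * step;  /* VF/n * Step */
    float large_step = 8.0f * step;      /* VF * Step */

    /* Before: the unrolled copy uses the small step, while the
       loop-closing value needs a separate large step register.  */
    float vec_1 = iv + small_step;
    float vec_loop = iv + large_step;

    /* After: the loop-closing value is just one more small step,
       so the large step is never materialized.  */
    float vec_n = vec_1 + small_step;

    printf ("%f == %f\n", vec_loop, vec_n);  /* prints 8.000000 == 8.000000 */
    return 0;
  }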

gcc/ChangeLog:

	PR tree-optimization/110449
	* tree-vect-loop.cc (vectorizable_induction): Use vec_n to replace
	vec_loop for the unrolled loop.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr110449.c: New testcase.
---
 gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
 gcc/tree-vect-loop.cc                       | 21 +++++++++--
 2 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c

Comments

Richard Biener July 6, 2023, 12:44 p.m. UTC | #1
On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
>
> If a loop is unrolled by n times during vectoriation, two steps are used to
> calculate the induction variable:
>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>
> This patch calculates an extra vec_n to replace vec_loop:
>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>
> So that we can save the large step register and related operations.

OK.  It would be nice to avoid the dead stmts created earlier though.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR tree-optimization/110449
>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>         vec_loop for the unrolled loop.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/pr110449.c: New testcase.
> ---
>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
>  2 files changed, 58 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> new file mode 100644
> index 00000000000..bb3b6dcfe08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> +
> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> +
> +   before (suggested_unroll_factor=2):
> +         fmov    s30, 8.0e+0
> +         fmov    s31, 4.0e+0
> +         dup     v27.4s, v30.s[0]
> +         dup     v28.4s, v31.s[0]
> +     .L6:
> +         mov     v30.16b, v31.16b
> +         fadd    v31.4s, v31.4s, v27.4s
> +         fadd    v29.4s, v30.4s, v28.4s
> +         stp     q30, q29, [x0]
> +         add     x0, x0, 32
> +         cmp     x1, x0
> +         bne     .L6
> +
> +   after:
> +         fmov    s31, 4.0e+0
> +         dup     v29.4s, v31.s[0]
> +     .L6:
> +         fadd    v30.4s, v31.4s, v29.4s
> +         stp     q31, q30, [x0]
> +         add     x0, x0, 32
> +         fadd    v31.4s, v29.4s, v30.4s
> +         cmp     x0, x1
> +         bne     .L6  */
> +
> +void
> +foo2 (float *arr, float freq, float step)
> +{
> +  for (int i = 0; i < 1024; i++)
> +    {
> +      arr[i] = freq;
> +      freq += step;
> +    }
> +}
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 3b46c58a8d8..706ecbffd0c 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>                                    new_vec, step_vectype, NULL);
>
>        vec_def = induc_def;
> -      for (i = 1; i < ncopies; i++)
> +      for (i = 1; i < ncopies + 1; i++)
>         {
>           /* vec_i = vec_prev + vec_step  */
>           gimple_seq stmts = NULL;
> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>           vec_def = gimple_convert (&stmts, vectype, vec_def);
>
>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +         if (i < ncopies)
> +           {
> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> +           }
> +         else
> +           {
> +             /* vec_1 = vec_iv + (VF/n * S)
> +                vec_2 = vec_1 + (VF/n * S)
> +                ...
> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> +
> +                vec_n is used as vec_loop to save the large step register and
> +                related operations.  */
> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> +                          UNKNOWN_LOCATION);
> +           }
>         }
>      }
>
> --
> 2.34.1
Jeff Law July 6, 2023, 4:06 p.m. UTC | #2
On 7/6/23 06:44, Richard Biener via Gcc-patches wrote:
> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Hi,
>>
>> If a loop is unrolled by n times during vectoriation, two steps are used to
>> calculate the induction variable:
>>    - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>    - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>
>> This patch calculates an extra vec_n to replace vec_loop:
>>    vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>
>> So that we can save the large step register and related operations.
> 
> OK.  It would be nice to avoid the dead stmts created earlier though.
> 
> Thanks,
> Richard.
> 
>> gcc/ChangeLog:
>>
>>          PR tree-optimization/110449
>>          * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>          vec_loop for the unrolled loop.
>>
>> gcc/testsuite/ChangeLog:
>>
>>          * gcc.target/aarch64/pr110449.c: New testcase.
I didn't see Hao Liu in the MAINTAINERS file, so he probably doesn't have 
write access.  Therefore I went ahead and pushed this for Hao.

jeff
Richard Sandiford July 6, 2023, 5:50 p.m. UTC | #3
Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Hi,
>>
>> If a loop is unrolled by n times during vectoriation, two steps are used to
>> calculate the induction variable:
>>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>
>> This patch calculates an extra vec_n to replace vec_loop:
>>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>
>> So that we can save the large step register and related operations.
>
> OK.  It would be nice to avoid the dead stmts created earlier though.

FWIW, I still don't think we should do this.  Part of the point of
unrolling is to shorten loop-carried dependencies, whereas this patch
is going in the opposite direction.

Richard

>
> Thanks,
> Richard.
>
>> gcc/ChangeLog:
>>
>>         PR tree-optimization/110449
>>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>         vec_loop for the unrolled loop.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/aarch64/pr110449.c: New testcase.
>> ---
>>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>  2 files changed, 58 insertions(+), 3 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>
>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>> new file mode 100644
>> index 00000000000..bb3b6dcfe08
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>> @@ -0,0 +1,40 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>> +
>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>> +
>> +   before (suggested_unroll_factor=2):
>> +         fmov    s30, 8.0e+0
>> +         fmov    s31, 4.0e+0
>> +         dup     v27.4s, v30.s[0]
>> +         dup     v28.4s, v31.s[0]
>> +     .L6:
>> +         mov     v30.16b, v31.16b
>> +         fadd    v31.4s, v31.4s, v27.4s
>> +         fadd    v29.4s, v30.4s, v28.4s
>> +         stp     q30, q29, [x0]
>> +         add     x0, x0, 32
>> +         cmp     x1, x0
>> +         bne     .L6
>> +
>> +   after:
>> +         fmov    s31, 4.0e+0
>> +         dup     v29.4s, v31.s[0]
>> +     .L6:
>> +         fadd    v30.4s, v31.4s, v29.4s
>> +         stp     q31, q30, [x0]
>> +         add     x0, x0, 32
>> +         fadd    v31.4s, v29.4s, v30.4s
>> +         cmp     x0, x1
>> +         bne     .L6  */
>> +
>> +void
>> +foo2 (float *arr, float freq, float step)
>> +{
>> +  for (int i = 0; i < 1024; i++)
>> +    {
>> +      arr[i] = freq;
>> +      freq += step;
>> +    }
>> +}
>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>> index 3b46c58a8d8..706ecbffd0c 100644
>> --- a/gcc/tree-vect-loop.cc
>> +++ b/gcc/tree-vect-loop.cc
>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>                                    new_vec, step_vectype, NULL);
>>
>>        vec_def = induc_def;
>> -      for (i = 1; i < ncopies; i++)
>> +      for (i = 1; i < ncopies + 1; i++)
>>         {
>>           /* vec_i = vec_prev + vec_step  */
>>           gimple_seq stmts = NULL;
>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>           vec_def = gimple_convert (&stmts, vectype, vec_def);
>>
>>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +         if (i < ncopies)
>> +           {
>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>> +           }
>> +         else
>> +           {
>> +             /* vec_1 = vec_iv + (VF/n * S)
>> +                vec_2 = vec_1 + (VF/n * S)
>> +                ...
>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>> +
>> +                vec_n is used as vec_loop to save the large step register and
>> +                related operations.  */
>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>> +                          UNKNOWN_LOCATION);
>> +           }
>>         }
>>      }
>>
>> --
>> 2.34.1
Hao Liu OS July 7, 2023, 2:23 a.m. UTC | #4
Hi Jeff,

Thanks for your help.

Actually, I have write access, as I was added to the "contributor list". Anyway, it's very kind of you to help commit the patch.

Thanks,
-Hao
Richard Biener July 7, 2023, 5:32 a.m. UTC | #5
> Am 06.07.2023 um 19:50 schrieb Richard Sandiford <richard.sandiford@arm.com>:
> 
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
>>> <gcc-patches@gcc.gnu.org> wrote:
>>> 
>>> Hi,
>>> 
>>> If a loop is unrolled by n times during vectoriation, two steps are used to
>>> calculate the induction variable:
>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>> 
>>> This patch calculates an extra vec_n to replace vec_loop:
>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>> 
>>> So that we can save the large step register and related operations.
>> 
>> OK.  It would be nice to avoid the dead stmts created earlier though.
> 
> FWIW, I still don't think we should do this.  Part of the point of
> unrolling is to shorten loop-carried dependencies, whereas this patch
> is going in the opposite direction.

Note ncopies can be >1 without additional unrolling.  With non VLA vectors all of the updates will be constant folded btw.

Richard 

> Richard
> 
>> 
>> Thanks,
>> Richard.
>> 
>>> gcc/ChangeLog:
>>> 
>>>        PR tree-optimization/110449
>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>>        vec_loop for the unrolled loop.
>>> 
>>> gcc/testsuite/ChangeLog:
>>> 
>>>        * gcc.target/aarch64/pr110449.c: New testcase.
>>> ---
>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>> 2 files changed, 58 insertions(+), 3 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> 
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> new file mode 100644
>>> index 00000000000..bb3b6dcfe08
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>> @@ -0,0 +1,40 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>>> +
>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>>> +
>>> +   before (suggested_unroll_factor=2):
>>> +         fmov    s30, 8.0e+0
>>> +         fmov    s31, 4.0e+0
>>> +         dup     v27.4s, v30.s[0]
>>> +         dup     v28.4s, v31.s[0]
>>> +     .L6:
>>> +         mov     v30.16b, v31.16b
>>> +         fadd    v31.4s, v31.4s, v27.4s
>>> +         fadd    v29.4s, v30.4s, v28.4s
>>> +         stp     q30, q29, [x0]
>>> +         add     x0, x0, 32
>>> +         cmp     x1, x0
>>> +         bne     .L6
>>> +
>>> +   after:
>>> +         fmov    s31, 4.0e+0
>>> +         dup     v29.4s, v31.s[0]
>>> +     .L6:
>>> +         fadd    v30.4s, v31.4s, v29.4s
>>> +         stp     q31, q30, [x0]
>>> +         add     x0, x0, 32
>>> +         fadd    v31.4s, v29.4s, v30.4s
>>> +         cmp     x0, x1
>>> +         bne     .L6  */
>>> +
>>> +void
>>> +foo2 (float *arr, float freq, float step)
>>> +{
>>> +  for (int i = 0; i < 1024; i++)
>>> +    {
>>> +      arr[i] = freq;
>>> +      freq += step;
>>> +    }
>>> +}
>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>>> index 3b46c58a8d8..706ecbffd0c 100644
>>> --- a/gcc/tree-vect-loop.cc
>>> +++ b/gcc/tree-vect-loop.cc
>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>                                   new_vec, step_vectype, NULL);
>>> 
>>>       vec_def = induc_def;
>>> -      for (i = 1; i < ncopies; i++)
>>> +      for (i = 1; i < ncopies + 1; i++)
>>>        {
>>>          /* vec_i = vec_prev + vec_step  */
>>>          gimple_seq stmts = NULL;
>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
>>> 
>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>> +         if (i < ncopies)
>>> +           {
>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>> +           }
>>> +         else
>>> +           {
>>> +             /* vec_1 = vec_iv + (VF/n * S)
>>> +                vec_2 = vec_1 + (VF/n * S)
>>> +                ...
>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>>> +
>>> +                vec_n is used as vec_loop to save the large step register and
>>> +                related operations.  */
>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>>> +                          UNKNOWN_LOCATION);
>>> +           }
>>>        }
>>>     }
>>> 
>>> --
>>> 2.34.1
Richard Sandiford July 7, 2023, 7:53 a.m. UTC | #6
Richard Biener <richard.guenther@gmail.com> writes:
>> Am 06.07.2023 um 19:50 schrieb Richard Sandiford <richard.sandiford@arm.com>:
>> 
>> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
>>>> <gcc-patches@gcc.gnu.org> wrote:
>>>> 
>>>> Hi,
>>>> 
>>>> If a loop is unrolled by n times during vectoriation, two steps are used to
>>>> calculate the induction variable:
>>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
>>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
>>>> 
>>>> This patch calculates an extra vec_n to replace vec_loop:
>>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
>>>> 
>>>> So that we can save the large step register and related operations.
>>> 
>>> OK.  It would be nice to avoid the dead stmts created earlier though.
>> 
>> FWIW, I still don't think we should do this.  Part of the point of
>> unrolling is to shorten loop-carried dependencies, whereas this patch
>> is going in the opposite direction.
>
> Note ncopies can be >1 without additional unrolling.

Yeah, true.  But I think even there, avoiding a longer loop-carried
dependency should be a good thing.

> With non VLA vectors all of the updates will be constant folded btw.

Are you sure?  The motivating example is an Advanced SIMD one,
not a VLA one.  No variable-length vectors are involved.

Maybe constant folding caps the dependency chain to length 2?
But 2 is still more than 1. :)

Thanks,
Richard

>
> Richard 
>
>> Richard
>> 
>>> 
>>> Thanks,
>>> Richard.
>>> 
>>>> gcc/ChangeLog:
>>>> 
>>>>        PR tree-optimization/110449
>>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
>>>>        vec_loop for the unrolled loop.
>>>> 
>>>> gcc/testsuite/ChangeLog:
>>>> 
>>>>        * gcc.target/aarch64/pr110449.c: New testcase.
>>>> ---
>>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
>>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
>>>> 2 files changed, 58 insertions(+), 3 deletions(-)
>>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> 
>>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> new file mode 100644
>>>> index 00000000000..bb3b6dcfe08
>>>> --- /dev/null
>>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
>>>> @@ -0,0 +1,40 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
>>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
>>>> +
>>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
>>>> +
>>>> +   before (suggested_unroll_factor=2):
>>>> +         fmov    s30, 8.0e+0
>>>> +         fmov    s31, 4.0e+0
>>>> +         dup     v27.4s, v30.s[0]
>>>> +         dup     v28.4s, v31.s[0]
>>>> +     .L6:
>>>> +         mov     v30.16b, v31.16b
>>>> +         fadd    v31.4s, v31.4s, v27.4s
>>>> +         fadd    v29.4s, v30.4s, v28.4s
>>>> +         stp     q30, q29, [x0]
>>>> +         add     x0, x0, 32
>>>> +         cmp     x1, x0
>>>> +         bne     .L6
>>>> +
>>>> +   after:
>>>> +         fmov    s31, 4.0e+0
>>>> +         dup     v29.4s, v31.s[0]
>>>> +     .L6:
>>>> +         fadd    v30.4s, v31.4s, v29.4s
>>>> +         stp     q31, q30, [x0]
>>>> +         add     x0, x0, 32
>>>> +         fadd    v31.4s, v29.4s, v30.4s
>>>> +         cmp     x0, x1
>>>> +         bne     .L6  */
>>>> +
>>>> +void
>>>> +foo2 (float *arr, float freq, float step)
>>>> +{
>>>> +  for (int i = 0; i < 1024; i++)
>>>> +    {
>>>> +      arr[i] = freq;
>>>> +      freq += step;
>>>> +    }
>>>> +}
>>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
>>>> index 3b46c58a8d8..706ecbffd0c 100644
>>>> --- a/gcc/tree-vect-loop.cc
>>>> +++ b/gcc/tree-vect-loop.cc
>>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>>                                   new_vec, step_vectype, NULL);
>>>> 
>>>>       vec_def = induc_def;
>>>> -      for (i = 1; i < ncopies; i++)
>>>> +      for (i = 1; i < ncopies + 1; i++)
>>>>        {
>>>>          /* vec_i = vec_prev + vec_step  */
>>>>          gimple_seq stmts = NULL;
>>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
>>>> 
>>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
>>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>>> +         if (i < ncopies)
>>>> +           {
>>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
>>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
>>>> +           }
>>>> +         else
>>>> +           {
>>>> +             /* vec_1 = vec_iv + (VF/n * S)
>>>> +                vec_2 = vec_1 + (VF/n * S)
>>>> +                ...
>>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
>>>> +
>>>> +                vec_n is used as vec_loop to save the large step register and
>>>> +                related operations.  */
>>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
>>>> +                          UNKNOWN_LOCATION);
>>>> +           }
>>>>        }
>>>>     }
>>>> 
>>>> --
>>>> 2.34.1
Richard Biener July 7, 2023, 10:23 a.m. UTC | #7
On Fri, Jul 7, 2023 at 9:53 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener <richard.guenther@gmail.com> writes:
> >> Am 06.07.2023 um 19:50 schrieb Richard Sandiford <richard.sandiford@arm.com>:
> >>
> >> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >>>> On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> >>>> <gcc-patches@gcc.gnu.org> wrote:
> >>>>
> >>>> Hi,
> >>>>
> >>>> If a loop is unrolled by n times during vectoriation, two steps are used to
> >>>> calculate the induction variable:
> >>>>  - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
> >>>>  - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
> >>>>
> >>>> This patch calculates an extra vec_n to replace vec_loop:
> >>>>  vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
> >>>>
> >>>> So that we can save the large step register and related operations.
> >>>
> >>> OK.  It would be nice to avoid the dead stmts created earlier though.
> >>
> >> FWIW, I still don't think we should do this.  Part of the point of
> >> unrolling is to shorten loop-carried dependencies, whereas this patch
> >> is going in the opposite direction.
> >
> > Note ncopies can be >1 without additional unrolling.
>
> Yeah, true.  But I think even there, avoiding a longer loop-carried
> dependency should be a good thing.
>
> > With non VLA vectors all of the updates will be constant folded btw.
>
> Are you sure?  The motivating example is an Advanced SIMD one,
> not a VLA one.  No variable-length vectors are involved.
>
> Maybe constant folding caps the dependency chain to length 2?
> But 2 is still more than 1. :)

The

  /* (A +- CST1) +- CST2 -> A + CST3
     Use view_convert because it is safe for vectors and equivalent for
     scalars.  */
  (for outer_op (plus minus)
   (for inner_op (plus minus)
        neg_inner_op (minus plus)

pattern should apply here for example during forwprop.  It handles
vector constants just fine, so I wonder why it doesn't trigger.
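
A minimal source-level sketch of what that fold does (illustrative only;
the function name is made up, and it assumes -Ofast as in the testcase so
floating-point re-association is permitted; per the note above, the
pattern handles the equivalent vector-constant form the same way):

  /* Two chained adds of constants collapse into a single add.  */
  float
  fold_chain (float a)
  {
    float t = a + 4.0f;   /* (A + CST1) ...          */
    return t + 4.0f;      /* ... + CST2 -> a + 8.0f  */
  }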

Richard.

> Thanks,
> Richard
>
> >
> > Richard
> >
> >> Richard
> >>
> >>>
> >>> Thanks,
> >>> Richard.
> >>>
> >>>> gcc/ChangeLog:
> >>>>
> >>>>        PR tree-optimization/110449
> >>>>        * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
> >>>>        vec_loop for the unrolled loop.
> >>>>
> >>>> gcc/testsuite/ChangeLog:
> >>>>
> >>>>        * gcc.target/aarch64/pr110449.c: New testcase.
> >>>> ---
> >>>> gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
> >>>> gcc/tree-vect-loop.cc                       | 21 +++++++++--
> >>>> 2 files changed, 58 insertions(+), 3 deletions(-)
> >>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>>
> >>>> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>> new file mode 100644
> >>>> index 00000000000..bb3b6dcfe08
> >>>> --- /dev/null
> >>>> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>>> @@ -0,0 +1,40 @@
> >>>> +/* { dg-do compile } */
> >>>> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> >>>> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> >>>> +
> >>>> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> >>>> +
> >>>> +   before (suggested_unroll_factor=2):
> >>>> +         fmov    s30, 8.0e+0
> >>>> +         fmov    s31, 4.0e+0
> >>>> +         dup     v27.4s, v30.s[0]
> >>>> +         dup     v28.4s, v31.s[0]
> >>>> +     .L6:
> >>>> +         mov     v30.16b, v31.16b
> >>>> +         fadd    v31.4s, v31.4s, v27.4s
> >>>> +         fadd    v29.4s, v30.4s, v28.4s
> >>>> +         stp     q30, q29, [x0]
> >>>> +         add     x0, x0, 32
> >>>> +         cmp     x1, x0
> >>>> +         bne     .L6
> >>>> +
> >>>> +   after:
> >>>> +         fmov    s31, 4.0e+0
> >>>> +         dup     v29.4s, v31.s[0]
> >>>> +     .L6:
> >>>> +         fadd    v30.4s, v31.4s, v29.4s
> >>>> +         stp     q31, q30, [x0]
> >>>> +         add     x0, x0, 32
> >>>> +         fadd    v31.4s, v29.4s, v30.4s
> >>>> +         cmp     x0, x1
> >>>> +         bne     .L6  */
> >>>> +
> >>>> +void
> >>>> +foo2 (float *arr, float freq, float step)
> >>>> +{
> >>>> +  for (int i = 0; i < 1024; i++)
> >>>> +    {
> >>>> +      arr[i] = freq;
> >>>> +      freq += step;
> >>>> +    }
> >>>> +}
> >>>> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >>>> index 3b46c58a8d8..706ecbffd0c 100644
> >>>> --- a/gcc/tree-vect-loop.cc
> >>>> +++ b/gcc/tree-vect-loop.cc
> >>>> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>>>                                   new_vec, step_vectype, NULL);
> >>>>
> >>>>       vec_def = induc_def;
> >>>> -      for (i = 1; i < ncopies; i++)
> >>>> +      for (i = 1; i < ncopies + 1; i++)
> >>>>        {
> >>>>          /* vec_i = vec_prev + vec_step  */
> >>>>          gimple_seq stmts = NULL;
> >>>> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>>>          vec_def = gimple_convert (&stmts, vectype, vec_def);
> >>>>
> >>>>          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> >>>> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >>>> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>>> +         if (i < ncopies)
> >>>> +           {
> >>>> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >>>> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >>>> +           }
> >>>> +         else
> >>>> +           {
> >>>> +             /* vec_1 = vec_iv + (VF/n * S)
> >>>> +                vec_2 = vec_1 + (VF/n * S)
> >>>> +                ...
> >>>> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> >>>> +
> >>>> +                vec_n is used as vec_loop to save the large step register and
> >>>> +                related operations.  */
> >>>> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> >>>> +                          UNKNOWN_LOCATION);
> >>>> +           }
> >>>>        }
> >>>>     }
> >>>>
> >>>> --
> >>>> 2.34.1
Richard Biener Sept. 5, 2024, 8:22 a.m. UTC | #8
On Thu, Jul 6, 2023 at 7:50 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Biener via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > On Wed, Jul 5, 2023 at 8:44 AM Hao Liu OS via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> Hi,
> >>
> >> If a loop is unrolled by n times during vectoriation, two steps are used to
> >> calculate the induction variable:
> >>   - The small step for the unrolled ith-copy: vec_1 = vec_iv + (VF/n * Step)
> >>   - The large step for the whole loop: vec_loop = vec_iv + (VF * Step)
> >>
> >> This patch calculates an extra vec_n to replace vec_loop:
> >>   vec_n = vec_prev + (VF/n * S) = vec_iv + (VF/n * S) * n = vec_loop.
> >>
> >> So that we can save the large step register and related operations.
> >
> > OK.  It would be nice to avoid the dead stmts created earlier though.
>
> FWIW, I still don't think we should do this.  Part of the point of
> unrolling is to shorten loop-carried dependencies, whereas this patch
> is going in the opposite direction.

Just to note: when forcing SLP for the added testcase, we're now back
to the shorter dependence (and a testsuite regression).  Either the
SLP path didn't exist or it wasn't updated.

I'll leave the testcase FAILing and will not work to carry over the
"optimization" to the SLP side (I agree that unrolling should avoid
those dependence chains).

Richard.

> Richard
>
> >
> > Thanks,
> > Richard.
> >
> >> gcc/ChangeLog:
> >>
> >>         PR tree-optimization/110449
> >>         * tree-vect-loop.cc (vectorizable_induction): use vec_n to replace
> >>         vec_loop for the unrolled loop.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>         * gcc.target/aarch64/pr110449.c: New testcase.
> >> ---
> >>  gcc/testsuite/gcc.target/aarch64/pr110449.c | 40 +++++++++++++++++++++
> >>  gcc/tree-vect-loop.cc                       | 21 +++++++++--
> >>  2 files changed, 58 insertions(+), 3 deletions(-)
> >>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110449.c
> >>
> >> diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >> new file mode 100644
> >> index 00000000000..bb3b6dcfe08
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
> >> @@ -0,0 +1,40 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
> >> +/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
> >> +
> >> +/* Calcualte the vectorized induction with smaller step for an unrolled loop.
> >> +
> >> +   before (suggested_unroll_factor=2):
> >> +         fmov    s30, 8.0e+0
> >> +         fmov    s31, 4.0e+0
> >> +         dup     v27.4s, v30.s[0]
> >> +         dup     v28.4s, v31.s[0]
> >> +     .L6:
> >> +         mov     v30.16b, v31.16b
> >> +         fadd    v31.4s, v31.4s, v27.4s
> >> +         fadd    v29.4s, v30.4s, v28.4s
> >> +         stp     q30, q29, [x0]
> >> +         add     x0, x0, 32
> >> +         cmp     x1, x0
> >> +         bne     .L6
> >> +
> >> +   after:
> >> +         fmov    s31, 4.0e+0
> >> +         dup     v29.4s, v31.s[0]
> >> +     .L6:
> >> +         fadd    v30.4s, v31.4s, v29.4s
> >> +         stp     q31, q30, [x0]
> >> +         add     x0, x0, 32
> >> +         fadd    v31.4s, v29.4s, v30.4s
> >> +         cmp     x0, x1
> >> +         bne     .L6  */
> >> +
> >> +void
> >> +foo2 (float *arr, float freq, float step)
> >> +{
> >> +  for (int i = 0; i < 1024; i++)
> >> +    {
> >> +      arr[i] = freq;
> >> +      freq += step;
> >> +    }
> >> +}
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 3b46c58a8d8..706ecbffd0c 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -10114,7 +10114,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>                                    new_vec, step_vectype, NULL);
> >>
> >>        vec_def = induc_def;
> >> -      for (i = 1; i < ncopies; i++)
> >> +      for (i = 1; i < ncopies + 1; i++)
> >>         {
> >>           /* vec_i = vec_prev + vec_step  */
> >>           gimple_seq stmts = NULL;
> >> @@ -10124,8 +10124,23 @@ vectorizable_induction (loop_vec_info loop_vinfo,
> >>           vec_def = gimple_convert (&stmts, vectype, vec_def);
> >>
> >>           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
> >> -         new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >> -         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> +         if (i < ncopies)
> >> +           {
> >> +             new_stmt = SSA_NAME_DEF_STMT (vec_def);
> >> +             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
> >> +           }
> >> +         else
> >> +           {
> >> +             /* vec_1 = vec_iv + (VF/n * S)
> >> +                vec_2 = vec_1 + (VF/n * S)
> >> +                ...
> >> +                vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
> >> +
> >> +                vec_n is used as vec_loop to save the large step register and
> >> +                related operations.  */
> >> +             add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
> >> +                          UNKNOWN_LOCATION);
> >> +           }
> >>         }
> >>      }
> >>
> >> --
> >> 2.34.1

Patch

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110449.c b/gcc/testsuite/gcc.target/aarch64/pr110449.c
new file mode 100644
index 00000000000..bb3b6dcfe08
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110449.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mcpu=neoverse-n2 --param aarch64-vect-unroll-limit=2" } */
+/* { dg-final { scan-assembler-not "8.0e\\+0" } } */
+
+/* Calculate the vectorized induction with a smaller step for an unrolled loop.
+
+   before (suggested_unroll_factor=2):
+	  fmov    s30, 8.0e+0
+	  fmov    s31, 4.0e+0
+	  dup     v27.4s, v30.s[0]
+	  dup     v28.4s, v31.s[0]
+     .L6:
+	  mov     v30.16b, v31.16b
+	  fadd    v31.4s, v31.4s, v27.4s
+	  fadd    v29.4s, v30.4s, v28.4s
+	  stp     q30, q29, [x0]
+	  add     x0, x0, 32
+	  cmp     x1, x0
+	  bne     .L6
+
+   after:
+	  fmov    s31, 4.0e+0
+	  dup     v29.4s, v31.s[0]
+     .L6:
+	  fadd    v30.4s, v31.4s, v29.4s
+	  stp     q31, q30, [x0]
+	  add     x0, x0, 32
+	  fadd    v31.4s, v29.4s, v30.4s
+	  cmp     x0, x1
+	  bne     .L6  */
+
+void
+foo2 (float *arr, float freq, float step)
+{
+  for (int i = 0; i < 1024; i++)
+    {
+      arr[i] = freq;
+      freq += step;
+    }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3b46c58a8d8..706ecbffd0c 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10114,7 +10114,7 @@  vectorizable_induction (loop_vec_info loop_vinfo,
 				   new_vec, step_vectype, NULL);
 
       vec_def = induc_def;
-      for (i = 1; i < ncopies; i++)
+      for (i = 1; i < ncopies + 1; i++)
 	{
 	  /* vec_i = vec_prev + vec_step  */
 	  gimple_seq stmts = NULL;
@@ -10124,8 +10124,23 @@  vectorizable_induction (loop_vec_info loop_vinfo,
 	  vec_def = gimple_convert (&stmts, vectype, vec_def);
  
 	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
-	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
-	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	  if (i < ncopies)
+	    {
+	      new_stmt = SSA_NAME_DEF_STMT (vec_def);
+	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	    }
+	  else
+	    {
+	      /* vec_1 = vec_iv + (VF/n * S)
+		 vec_2 = vec_1 + (VF/n * S)
+		 ...
+		 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
+
+		 vec_n is used as vec_loop to save the large step register and
+		 related operations.  */
+	      add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+			   UNKNOWN_LOCATION);
+	    }
 	}
     }