[1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

Message ID 20240912005129.26758-1-quic_pzheng@quicinc.com
State New
Series [1/2] aarch64: Improve vector constant generation using SVE INDEX instruction [PR113328]

Commit Message

Pengxuan Zheng Sept. 12, 2024, 12:51 a.m. UTC
SVE's INDEX instruction can be used to populate a vector with values starting
at "base" and incremented by "step" for each subsequent element. We can take
advantage of it to generate vector constants if TARGET_SVE is available and the
base and step values are within [-16, 15].
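
As a rough illustration, the effect of INDEX can be modeled in C (a sketch
for exposition only; the function and parameter names are ours, not GCC's
or the architecture's):

/* Model of "index z.T, #base, #step": element i of an n-element
   vector gets the value base + i * step.  */
static void
index_model (int *z, int n, int base, int step)
{
  for (int i = 0; i < n; i++)
    z[i] = base + i * step;
}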

For example, with the following function:

typedef int v4si __attribute__ ((vector_size (16)));
v4si
f_v4si (void)
{
  return (v4si){ 0, 1, 2, 3 };
}

GCC currently generates:

f_v4si:
	adrp    x0, .LC4
	ldr     q0, [x0, #:lo12:.LC4]
	ret

.LC4:
	.word   0
	.word   1
	.word   2
	.word   3

With this patch, we generate an INDEX instruction instead if TARGET_SVE is
available.

f_v4si:
	index   z0.s, #0, #1
	ret

	PR target/113328

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
	handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
	available.
	(aarch64_output_simd_mov_immediate): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
	SVE's INDEX instruction.
	* gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
	* gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
	* gcc.target/aarch64/sve/vec_init_3.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
---
 gcc/config/aarch64/aarch64.cc                 | 12 ++-
 .../aarch64/sve/acle/general/dupq_1.c         |  3 +-
 .../aarch64/sve/acle/general/dupq_2.c         |  3 +-
 .../aarch64/sve/acle/general/dupq_3.c         |  3 +-
 .../aarch64/sve/acle/general/dupq_4.c         |  3 +-
 .../gcc.target/aarch64/sve/vec_init_3.c       | 99 +++++++++++++++++++
 6 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c

Comments

Richard Biener Sept. 12, 2024, 6:46 a.m. UTC | #1
On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng <quic_pzheng@quicinc.com> wrote:
>
> SVE's INDEX instruction can be used to populate vectors by values starting from
> "base" and incremented by "step" for each subsequent value. We can take
> advantage of it to generate vector constants if TARGET_SVE is available and the
> base and step values are within [-16, 15].

Are there multiply-by-immediate or add-scalar-immediate instructions that
could enhance this with two-instruction sequences?

> For example, with the following function:
>
> typedef int v4si __attribute__ ((vector_size (16)));
> v4si
> f_v4si (void)
> {
>   return (v4si){ 0, 1, 2, 3 };
> }
>
> GCC currently generates:
>
> f_v4si:
>         adrp    x0, .LC4
>         ldr     q0, [x0, #:lo12:.LC4]
>         ret
>
> .LC4:
>         .word   0
>         .word   1
>         .word   2
>         .word   3
>
> With this patch, we generate an INDEX instruction instead if TARGET_SVE is
> available.
>
> f_v4si:
>         index   z0.s, #0, #1
>         ret
>
>         PR target/113328
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate): Improve
>         handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE is
>         available.
>         (aarch64_output_simd_mov_immediate): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
>         SVE's INDEX instruction.
>         * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
>         * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
>         * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
>         * gcc.target/aarch64/sve/vec_init_3.c: New test.
>
> Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.cc                 | 12 ++-
>  .../aarch64/sve/acle/general/dupq_1.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_2.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_3.c         |  3 +-
>  .../aarch64/sve/acle/general/dupq_4.c         |  3 +-
>  .../gcc.target/aarch64/sve/vec_init_3.c       | 99 +++++++++++++++++++
>  6 files changed, 114 insertions(+), 9 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 27e24ba70ab..6b3ca57d0eb 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>    if (CONST_VECTOR_P (op)
>        && CONST_VECTOR_DUPLICATE_P (op))
>      n_elts = CONST_VECTOR_NPATTERNS (op);
> -  else if ((vec_flags & VEC_SVE_DATA)
> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
>            && const_vec_series_p (op, &base, &step))
>      {
>        gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
> @@ -25249,6 +25249,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>
>    if (which == AARCH64_CHECK_MOV)
>      {
> +      if (info.insn == simd_immediate_info::INDEX)
> +       {
> +         gcc_assert (TARGET_SVE);
> +         snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
> +                   HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
> +                   element_char, INTVAL (info.u.index.base),
> +                   INTVAL (info.u.index.step));
> +         return templ;
> +       }
> +
>        mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
>        shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
>                   ? "msl" : "lsl");
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> index 216699b0536..0940bedd0dd 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (x, 1, 2, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> index d494943a275..218a6601337 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (x, 1, 2, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> index 4bc8259df07..245d43b75b5 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (0, 1, x, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> index 6f9f9f2f22f..cbee6f27b62 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> @@ -10,7 +10,6 @@ dupq (int x)
>    return svdupq_s32 (0, 1, x, 3);
>  }
>
> -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
>  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
>  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
> -/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> new file mode 100644
> index 00000000000..25910dbfa1f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +typedef char v16qi __attribute__ ((vector_size (16)));
> +typedef char v8qi __attribute__ ((vector_size (8)));
> +typedef short v8hi __attribute__ ((vector_size (16)));
> +typedef short v4hi __attribute__ ((vector_size (8)));
> +typedef int v4si __attribute__ ((vector_size (16)));
> +typedef int v2si __attribute__ ((vector_size (8)));
> +typedef long v2di __attribute__ ((vector_size (16)));
> +
> +/*
> +** f_v16qi:
> +**     index   z0\.b, #0, #1
> +**     ret
> +*/
> +v16qi
> +f_v16qi (void)
> +{
> +  return (v16qi){ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
> +}
> +
> +/*
> +** f_v8qi:
> +**     index   z0\.b, #0, #1
> +**     ret
> +*/
> +v8qi
> +f_v8qi (void)
> +{
> +  return (v8qi){ 0, 1, 2, 3, 4, 5, 6, 7 };
> +}
> +
> +/*
> +** f_v8hi:
> +**     index   z0\.h, #0, #1
> +**     ret
> +*/
> +v8hi
> +f_v8hi (void)
> +{
> +  return (v8hi){ 0, 1, 2, 3, 4, 5, 6, 7 };
> +}
> +
> +/*
> +** f_v4hi:
> +**     index   z0\.h, #0, #1
> +**     ret
> +*/
> +v4hi
> +f_v4hi (void)
> +{
> +  return (v4hi){ 0, 1, 2, 3 };
> +}
> +
> +/*
> +** f_v4si:
> +**     index   z0\.s, #0, #1
> +**     ret
> +*/
> +v4si
> +f_v4si (void)
> +{
> +  return (v4si){ 0, 1, 2, 3 };
> +}
> +
> +/*
> +** f_v2si:
> +**     index   z0\.s, #0, #1
> +**     ret
> +*/
> +v2si
> +f_v2si (void)
> +{
> +  return (v2si){ 0, 1 };
> +}
> +
> +/*
> +** f_v2di:
> +**     index   z0\.d, #0, #1
> +**     ret
> +*/
> +v2di
> +f_v2di (void)
> +{
> +  return (v2di){ 0, 1 };
> +}
> +
> +/*
> +** g_v4si:
> +**     index   z0\.s, #3, #-4
> +**     ret
> +*/
> +v4si
> +g_v4si (void)
> +{
> +  return (v4si){ 3, -1, -5, -9 };
> +}
> --
> 2.17.1
>
Pengxuan Zheng Sept. 12, 2024, 9:33 p.m. UTC | #2
> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> <quic_pzheng@quicinc.com> wrote:
> >
> > SVE's INDEX instruction can be used to populate vectors by values
> > starting from "base" and incremented by "step" for each subsequent
> > value. We can take advantage of it to generate vector constants if
> > TARGET_SVE is available and the base and step values are within [-16, 15].
> 
> Are there multiplication by or addition of scalar immediate instructions to
> enhance this with two-instruction sequences?

No, Richard, I can't think of any equivalent two-instruction sequences.

Thanks,
Pengxuan
> 
> > For example, with the following function:
> >
> > typedef int v4si __attribute__ ((vector_size (16))); v4si f_v4si
> > (void) {
> >   return (v4si){ 0, 1, 2, 3 };
> > }
> >
> > GCC currently generates:
> >
> > f_v4si:
> >         adrp    x0, .LC4
> >         ldr     q0, [x0, #:lo12:.LC4]
> >         ret
> >
> > .LC4:
> >         .word   0
> >         .word   1
> >         .word   2
> >         .word   3
> >
> > With this patch, we generate an INDEX instruction instead if
> > TARGET_SVE is available.
> >
> > f_v4si:
> >         index   z0.s, #0, #1
> >         ret
> >
> >         PR target/113328
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64.cc (aarch64_simd_valid_immediate):
> Improve
> >         handling of some ADVSIMD vectors by using SVE's INDEX if TARGET_SVE
> is
> >         available.
> >         (aarch64_output_simd_mov_immediate): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/sve/acle/general/dupq_1.c: Update test to use
> >         SVE's INDEX instruction.
> >         * gcc.target/aarch64/sve/acle/general/dupq_2.c: Likewise.
> >         * gcc.target/aarch64/sve/acle/general/dupq_3.c: Likewise.
> >         * gcc.target/aarch64/sve/acle/general/dupq_4.c: Likewise.
> >         * gcc.target/aarch64/sve/vec_init_3.c: New test.
> >
> > Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64.cc                 | 12 ++-
> >  .../aarch64/sve/acle/general/dupq_1.c         |  3 +-
> >  .../aarch64/sve/acle/general/dupq_2.c         |  3 +-
> >  .../aarch64/sve/acle/general/dupq_3.c         |  3 +-
> >  .../aarch64/sve/acle/general/dupq_4.c         |  3 +-
> >  .../gcc.target/aarch64/sve/vec_init_3.c       | 99 +++++++++++++++++++
> >  6 files changed, 114 insertions(+), 9 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> >
> > diff --git a/gcc/config/aarch64/aarch64.cc
> > b/gcc/config/aarch64/aarch64.cc index 27e24ba70ab..6b3ca57d0eb 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> simd_immediate_info *info,
> >    if (CONST_VECTOR_P (op)
> >        && CONST_VECTOR_DUPLICATE_P (op))
> >      n_elts = CONST_VECTOR_NPATTERNS (op);
> > -  else if ((vec_flags & VEC_SVE_DATA)
> > +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >            && const_vec_series_p (op, &base, &step))
> >      {
> >        gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); @@
> > -25249,6 +25249,16 @@ aarch64_output_simd_mov_immediate (rtx
> > const_vector, unsigned width,
> >
> >    if (which == AARCH64_CHECK_MOV)
> >      {
> > +      if (info.insn == simd_immediate_info::INDEX)
> > +       {
> > +         gcc_assert (TARGET_SVE);
> > +         snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
> > +                   HOST_WIDE_INT_PRINT_DEC ", #"
> HOST_WIDE_INT_PRINT_DEC,
> > +                   element_char, INTVAL (info.u.index.base),
> > +                   INTVAL (info.u.index.step));
> > +         return templ;
> > +       }
> > +
> >        mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
> >        shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
> >                   ? "msl" : "lsl");
> > diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > index 216699b0536..0940bedd0dd 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >    return svdupq_s32 (x, 1, 2, 3);
> >  }
> >
> > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
> >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
> >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n}
> > } } */
> > -/* { dg-final { scan-assembler
> > {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */ diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> > index d494943a275..218a6601337 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >    return svdupq_s32 (x, 1, 2, 3);
> >  }
> >
> > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
> >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
> >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n}
> > } } */
> > -/* { dg-final { scan-assembler
> > {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */ diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> > index 4bc8259df07..245d43b75b5 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >    return svdupq_s32 (0, 1, x, 3);
> >  }
> >
> > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
> >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
> >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n}
> > } } */
> > -/* { dg-final { scan-assembler
> > {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */
> > diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> > index 6f9f9f2f22f..cbee6f27b62 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
> > @@ -10,7 +10,6 @@ dupq (int x)
> >    return svdupq_s32 (0, 1, x, 3);
> >  }
> >
> > -/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
> > +/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
> >  /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
> >  /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n}
> > } } */
> > -/* { dg-final { scan-assembler
> > {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> > new file mode 100644
> > index 00000000000..25910dbfa1f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
> > @@ -0,0 +1,99 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +typedef char v16qi __attribute__ ((vector_size (16))); typedef char
> > +v8qi __attribute__ ((vector_size (8))); typedef short v8hi
> > +__attribute__ ((vector_size (16))); typedef short v4hi __attribute__
> > +((vector_size (8))); typedef int v4si __attribute__ ((vector_size
> > +(16))); typedef int v2si __attribute__ ((vector_size (8))); typedef
> > +long v2di __attribute__ ((vector_size (16)));
> > +
> > +/*
> > +** f_v16qi:
> > +**     index   z0\.b, #0, #1
> > +**     ret
> > +*/
> > +v16qi
> > +f_v16qi (void)
> > +{
> > +  return (v16qi){ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
> > +15 }; }
> > +
> > +/*
> > +** f_v8qi:
> > +**     index   z0\.b, #0, #1
> > +**     ret
> > +*/
> > +v8qi
> > +f_v8qi (void)
> > +{
> > +  return (v8qi){ 0, 1, 2, 3, 4, 5, 6, 7 }; }
> > +
> > +/*
> > +** f_v8hi:
> > +**     index   z0\.h, #0, #1
> > +**     ret
> > +*/
> > +v8hi
> > +f_v8hi (void)
> > +{
> > +  return (v8hi){ 0, 1, 2, 3, 4, 5, 6, 7 }; }
> > +
> > +/*
> > +** f_v4hi:
> > +**     index   z0\.h, #0, #1
> > +**     ret
> > +*/
> > +v4hi
> > +f_v4hi (void)
> > +{
> > +  return (v4hi){ 0, 1, 2, 3 };
> > +}
> > +
> > +/*
> > +** f_v4si:
> > +**     index   z0\.s, #0, #1
> > +**     ret
> > +*/
> > +v4si
> > +f_v4si (void)
> > +{
> > +  return (v4si){ 0, 1, 2, 3 };
> > +}
> > +
> > +/*
> > +** f_v2si:
> > +**     index   z0\.s, #0, #1
> > +**     ret
> > +*/
> > +v2si
> > +f_v2si (void)
> > +{
> > +  return (v2si){ 0, 1 };
> > +}
> > +
> > +/*
> > +** f_v2di:
> > +**     index   z0\.d, #0, #1
> > +**     ret
> > +*/
> > +v2di
> > +f_v2di (void)
> > +{
> > +  return (v2di){ 0, 1 };
> > +}
> > +
> > +/*
> > +** g_v4si:
> > +**     index   z0\.s, #3, #-4
> > +**     ret
> > +*/
> > +v4si
> > +g_v4si (void)
> > +{
> > +  return (v4si){ 3, -1, -5, -9 };
> > +}
> > --
> > 2.17.1
> >
Richard Sandiford Sept. 16, 2024, 2:32 p.m. UTC | #3
"Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
>> <quic_pzheng@quicinc.com> wrote:
>> >
>> > SVE's INDEX instruction can be used to populate vectors by values
>> > starting from "base" and incremented by "step" for each subsequent
>> > value. We can take advantage of it to generate vector constants if
>> > TARGET_SVE is available and the base and step values are within [-16, 15].
>> 
>> Are there multiplication by or addition of scalar immediate instructions to
>> enhance this with two-instruction sequences?
>
> No, Richard, I can't think of any equivalent two-instruction sequences.

There are some.  E.g.:

     { 16, 17, 18, 19, ... }

could be:

	index	z0.b, #0, #1
	add	z0.b, z0.b, #16

or, alternatively:

	mov	w0, #16
	index	z0.b, w0, #1

But these cases are less obviously a win, so I think it's ok to handle
single instructions only for now.
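
For concreteness, a C-level constant whose base falls outside the
single-INDEX immediate range [-16, 15] (an illustrative sketch; the
function name is made up):

typedef char v16qi __attribute__ ((vector_size (16)));

/* Base 16 is outside INDEX's immediate range of [-16, 15], so a
   single INDEX cannot materialize this constant.  */
v16qi
h_v16qi (void)
{
  return (v16qi){ 16, 17, 18, 19, 20, 21, 22, 23,
		  24, 25, 26, 27, 28, 29, 30, 31 };
}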

The patch is ok for trunk, thanks, but:

>> > @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
>> simd_immediate_info *info,
>> >    if (CONST_VECTOR_P (op)
>> >        && CONST_VECTOR_DUPLICATE_P (op))
>> >      n_elts = CONST_VECTOR_NPATTERNS (op);
>> > -  else if ((vec_flags & VEC_SVE_DATA)
>> > +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
>> >            && const_vec_series_p (op, &base, &step))

...the convention is to have one && condition per line if the whole
expression doesn't fit on a single line:

  else if (which == AARCH64_CHECK_MOV
           && TARGET_SVE
           && const_vec_series_p (op, &base, &step))

Richard
Pengxuan Zheng Sept. 16, 2024, 6:08 p.m. UTC | #4
> "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
> >> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> >> <quic_pzheng@quicinc.com> wrote:
> >> >
> >> > SVE's INDEX instruction can be used to populate vectors by values
> >> > starting from "base" and incremented by "step" for each subsequent
> >> > value. We can take advantage of it to generate vector constants if
> >> > TARGET_SVE is available and the base and step values are within [-16,
> 15].
> >>
> >> Are there multiplication by or addition of scalar immediate
> >> instructions to enhance this with two-instruction sequences?
> >
> > No, Richard, I can't think of any equivalent two-instruction sequences.
> 
> There are some.  E.g.:
> 
>      { 16, 17, 18, 19, ... }
> 
> could be:
> 
> 	index	z0.b, #0, #1
> 	add	z0.b, z0.b, #16
> 
> or, alternatively:
> 
> 	mov	w0, #16
> 	index	z0.b, w0, #1
> 
> But these cases are less obviously a win, so I think it's ok to handle single
> instructions only for now.
> 
> The patch is ok for trunk, thanks, but:
> 
> >> > @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> >> simd_immediate_info *info,
> >> >    if (CONST_VECTOR_P (op)
> >> >        && CONST_VECTOR_DUPLICATE_P (op))
> >> >      n_elts = CONST_VECTOR_NPATTERNS (op);
> >> > -  else if ((vec_flags & VEC_SVE_DATA)
> >> > +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >> >            && const_vec_series_p (op, &base, &step))
> 
> ...the convention is to have one && condition per line if the whole expression
> doesn't fit on a single line:
> 
>   else if (which == AARCH64_CHECK_MOV
>            && TARGET_SVE
>            && const_vec_series_p (op, &base, &step))
> 

Thanks for pointing this out, Richard! I've fixed this and pushed the patch as r15-3669-ga92f54f580c377.

Thanks,
Pengxuan
> Richard
Kyrylo Tkachov Sept. 17, 2024, 7:57 a.m. UTC | #5
> On 16 Sep 2024, at 16:32, Richard Sandiford <richard.sandiford@arm.com> wrote:
> 
> "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
>>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
>>> <quic_pzheng@quicinc.com> wrote:
>>>> 
>>>> SVE's INDEX instruction can be used to populate vectors by values
>>>> starting from "base" and incremented by "step" for each subsequent
>>>> value. We can take advantage of it to generate vector constants if
>>>> TARGET_SVE is available and the base and step values are within [-16, 15].
>>> 
>>> Are there multiplication by or addition of scalar immediate instructions to
>>> enhance this with two-instruction sequences?
>> 
>> No, Richard, I can't think of any equivalent two-instruction sequences.
> 
> There are some.  E.g.:
> 
>     { 16, 17, 18, 19, ... }
> 
> could be:
> 
>        index   z0.b, #0, #1
>        add     z0.b, z0.b, #16
> 
> or, alternatively:
> 
>        mov     w0, #16
>        index   z0.b, w0, #1
> 
> But these cases are less obviously a win, so I think it's ok to handle
> single instructions only for now.

(Not related to this patch, this work is great, thanks Pengxuan!)
Looking at some SWOGs (Software Optimization Guides), such as the one for Neoverse V2, it looks like the first sequence is preferable.
On that core the immediate-only form of INDEX has latency 4 and throughput 2, and the SVE ADD is as cheap as SIMD operations can be.
But in the second sequence the register-operand form of INDEX has latency 7 and throughput 1, as the core seems to treat it as a GP <-> SIMD transfer of some sort.
We may encounter a situation in the future where we’ll want to optimize the second sequence (if it comes from intrinsics code for example) into the first.
Thanks,
Kyrill


> 
> The patch is ok for trunk, thanks, but:
> 
>>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
>>> simd_immediate_info *info,
>>>>   if (CONST_VECTOR_P (op)
>>>>       && CONST_VECTOR_DUPLICATE_P (op))
>>>>     n_elts = CONST_VECTOR_NPATTERNS (op);
>>>> -  else if ((vec_flags & VEC_SVE_DATA)
>>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
>>>>           && const_vec_series_p (op, &base, &step))
> 
> ...the convention is to have one && condition per line if the whole
> expression doesn't fit on a single line:
> 
>  else if (which == AARCH64_CHECK_MOV
>           && TARGET_SVE
>           && const_vec_series_p (op, &base, &step))
> 
> Richard
Richard Biener Sept. 17, 2024, 8:52 a.m. UTC | #6
On Tue, Sep 17, 2024 at 9:57 AM Kyrylo Tkachov <ktkachov@nvidia.com> wrote:
>
>
>
> > On 16 Sep 2024, at 16:32, Richard Sandiford <richard.sandiford@arm.com> wrote:
> >
> > "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
> >>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> >>> <quic_pzheng@quicinc.com> wrote:
> >>>>
> >>>> SVE's INDEX instruction can be used to populate vectors by values
> >>>> starting from "base" and incremented by "step" for each subsequent
> >>>> value. We can take advantage of it to generate vector constants if
> >>>> TARGET_SVE is available and the base and step values are within [-16, 15].
> >>>
> >>> Are there multiplication by or addition of scalar immediate instructions to
> >>> enhance this with two-instruction sequences?
> >>
> >> No, Richard, I can't think of any equivalent two-instruction sequences.
> >
> > There are some.  E.g.:
> >
> >     { 16, 17, 18, 19, ... }
> >
> > could be:
> >
> >        index   z0.b, #0, #1
> >        add     z0.b, z0.b, #16
> >
> > or, alternatively:
> >
> >        mov     w0, #16
> >        index   z0.b, w0, #1

I guess an even step in [16, 31] could be handled with INDEX using half the
step and then adding the result to itself (a multiply by immediate #2), even
if there's no direct vector-by-immediate instruction available.  Likewise,
of course, some { A0 + n * B1 + n * B2, ... } can be handled by adding
two INDEX compute results.
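
For instance (a hypothetical sketch, not something the patch emits),
{ 0, 20, 40, 60 } has a step outside [-16, 15], but halving the step and
then doubling the result would work:

	index	z0.s, #0, #10		// z0 = { 0, 10, 20, 30 }
	add	z0.s, z0.s, z0.s	// z0 = { 0, 20, 40, 60 }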

> > But these cases are less obviously a win, so I think it's ok to handle
> > single instructions only for now.
>
> (Not related to this patch, this work is great, thanks Pengxuan!)
> Looking at some SWOGs like for Neoverse V2 it looks like the first sequence is preferable.
> On that core the INDEX-immediates-only operation has latency 4 and throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that core.
> But in the second sequence the INDEX-reg-operand has latency 7 and throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.

So what's the latency/throughput of a vector load from the constant pool
(can we even have an "SVE" constant pool?  I assume
entries would have to be of the architecturally largest vector size?),
assuming it's in L1 (where it would occupy quite some
space eventually).

Richard.

> We may encounter a situation in the future where we’ll want to optimize the second sequence (if it comes from intrinsics code for example) into the first.
> Thanks,
> Kyrill
>
>
> >
> > The patch is ok for trunk, thanks, but:
> >
> >>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> >>> simd_immediate_info *info,
> >>>>   if (CONST_VECTOR_P (op)
> >>>>       && CONST_VECTOR_DUPLICATE_P (op))
> >>>>     n_elts = CONST_VECTOR_NPATTERNS (op);
> >>>> -  else if ((vec_flags & VEC_SVE_DATA)
> >>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >>>>           && const_vec_series_p (op, &base, &step))
> >
> > ...the convention is to have one && condition per line if the whole
> > expression doesn't fit on a single line:
> >
> >  else if (which == AARCH64_CHECK_MOV
> >           && TARGET_SVE
> >           && const_vec_series_p (op, &base, &step))
> >
> > Richard
>
Kyrylo Tkachov Sept. 17, 2024, 9:03 a.m. UTC | #7
> On 17 Sep 2024, at 10:52, Richard Biener <richard.guenther@gmail.com> wrote:
> 
> On Tue, Sep 17, 2024 at 9:57 AM Kyrylo Tkachov <ktkachov@nvidia.com> wrote:
>> 
>> 
>> 
>>> On 16 Sep 2024, at 16:32, Richard Sandiford <richard.sandiford@arm.com> wrote:
>>> 
>>> "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
>>>>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
>>>>> <quic_pzheng@quicinc.com> wrote:
>>>>>> 
>>>>>> SVE's INDEX instruction can be used to populate vectors by values
>>>>>> starting from "base" and incremented by "step" for each subsequent
>>>>>> value. We can take advantage of it to generate vector constants if
>>>>>> TARGET_SVE is available and the base and step values are within [-16, 15].
>>>>> 
>>>>> Are there multiplication by or addition of scalar immediate instructions to
>>>>> enhance this with two-instruction sequences?
>>>> 
>>>> No, Richard, I can't think of any equivalent two-instruction sequences.
>>> 
>>> There are some.  E.g.:
>>> 
>>>    { 16, 17, 18, 19, ... }
>>> 
>>> could be:
>>> 
>>>       index   z0.b, #0, #1
>>>       add     z0.b, z0.b, #16
>>> 
>>> or, alternatively:
>>> 
>>>       mov     w0, #16
>>>       index   z0.b, w0, #1
> 
> I guess even step between [16, 31] could be handed with index with half
> step and then adding the result to itself (multiply by immediate #2), even
> if there's no direct vector-by-immediate instruction available.  Likewise
> of course  some { A0 + n * B1 + n * B2, ... } can be handled by adding
> two index compute results.

There are some such by-immediate instructions in SVE that we could try, but each one would need to be carefully evaluated as their latencies and throughputs may vary.


> 
>>> But these cases are less obviously a win, so I think it's ok to handle
>>> single instructions only for now.
>> 
>> (Not related to this patch, this work is great, thanks Pengxuan!)
>> Looking at some SWOGs like for Neoverse V2 it looks like the first sequence is preferable.
>> On that core the INDEX-immediates-only operation has latency 4 and throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that core.
>> But in the second sequence the INDEX-reg-operand has latency 7 and throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.
> 
> So what's the latency/throughput of a vector load from constant pool
> (can we even have a "SVE" constant pool?  I assume
> entries would have to be of the architecturally largest vector size?),
> assuming it's in L1 (where it would occupy quite some
> space eventually).

In this thread we're talking about implementing fixed-length 128-bit "Neon"/GCC vector extension operations with SVE instructions rather than VLA SVE constants, as SVE has some useful instructions that, applied to the bottom 128 bits, can do things that plain Neon can't. So the constant-pool alternative is a simple Neon address generation + load.
I haven't thought through the SVE constant creation story yet.
From what I can tell, the vector load of a Neon register has a latency of 6 or 7 cycles (throughput 3), and the ADRP for address generation is very fast (latency/throughput: 1/4).
Thanks,
Kyrill

> 
> Richard.
> 
>> We may encounter a situation in the future where we’ll want to optimize the second sequence (if it comes from intrinsics code for example) into the first.
>> Thanks,
>> Kyrill
>> 
>> 
>>> 
>>> The patch is ok for trunk, thanks, but:
>>> 
>>>>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
>>>>> simd_immediate_info *info,
>>>>>>  if (CONST_VECTOR_P (op)
>>>>>>      && CONST_VECTOR_DUPLICATE_P (op))
>>>>>>    n_elts = CONST_VECTOR_NPATTERNS (op);
>>>>>> -  else if ((vec_flags & VEC_SVE_DATA)
>>>>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
>>>>>>          && const_vec_series_p (op, &base, &step))
>>> 
>>> ...the convention is to have one && condition per line if the whole
>>> expression doesn't fit on a single line:
>>> 
>>> else if (which == AARCH64_CHECK_MOV
>>>          && TARGET_SVE
>>>          && const_vec_series_p (op, &base, &step))
>>> 
>>> Richard
>>
Pengxuan Zheng Sept. 17, 2024, 4:45 p.m. UTC | #8
> > On 16 Sep 2024, at 16:32, Richard Sandiford <richard.sandiford@arm.com>
> wrote:
> >
> > "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
> >>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> >>> <quic_pzheng@quicinc.com> wrote:
> >>>>
> >>>> SVE's INDEX instruction can be used to populate vectors by values
> >>>> starting from "base" and incremented by "step" for each subsequent
> >>>> value. We can take advantage of it to generate vector constants if
> >>>> TARGET_SVE is available and the base and step values are within [-16,
> 15].
> >>>
> >>> Are there multiplication by or addition of scalar immediate
> >>> instructions to enhance this with two-instruction sequences?
> >>
> >> No, Richard, I can't think of any equivalent two-instruction sequences.
> >
> > There are some.  E.g.:
> >
> >     { 16, 17, 18, 19, ... }
> >
> > could be:
> >
> >        index   z0.b, #0, #1
> >        add     z0.b, z0.b, #16
> >
> > or, alternatively:
> >
> >        mov     w0, #16
> >        index   z0.b, w0, #1
> >
> > But these cases are less obviously a win, so I think it's ok to handle
> > single instructions only for now.
> 
> (Not related to this patch, this work is great, thanks Pengxuan!) Looking at
> some SWOGs like for Neoverse V2 it looks like the first sequence is preferable.
> On that core the INDEX-immediates-only operation has latency 4 and
> throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that
> core.
> But in the second sequence the INDEX-reg-operand has latency 7 and
> throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.
> We may encounter a situation in the future where we’ll want to optimize the
> second sequence (if it comes from intrinsics code for example) into the first.

This does look like something that we may want to consider improving in the future. Thanks for bringing it up and elaborating on it, Kyrylo!

Thanks,
Pengxuan

> Thanks,
> Kyrill
> 
> 
> >
> > The patch is ok for trunk, thanks, but:
> >
> >>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> >>> simd_immediate_info *info,
> >>>>   if (CONST_VECTOR_P (op)
> >>>>       && CONST_VECTOR_DUPLICATE_P (op))
> >>>>     n_elts = CONST_VECTOR_NPATTERNS (op);
> >>>> -  else if ((vec_flags & VEC_SVE_DATA)
> >>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> >>>>           && const_vec_series_p (op, &base, &step))
> >
> > ...the convention is to have one && condition per line if the whole
> > expression doesn't fit on a single line:
> >
> >  else if (which == AARCH64_CHECK_MOV
> >           && TARGET_SVE
> >           && const_vec_series_p (op, &base, &step))
> >
> > Richard
Pengxuan Zheng Sept. 17, 2024, 4:48 p.m. UTC | #9
> > > On 16 Sep 2024, at 16:32, Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> > >
> > > "Pengxuan Zheng (QUIC)" <quic_pzheng@quicinc.com> writes:
> > >>> On Thu, Sep 12, 2024 at 2:53 AM Pengxuan Zheng
> > >>> <quic_pzheng@quicinc.com> wrote:
> > >>>>
> > >>>> SVE's INDEX instruction can be used to populate vectors by values
> > >>>> starting from "base" and incremented by "step" for each
> > >>>> subsequent value. We can take advantage of it to generate vector
> > >>>> constants if TARGET_SVE is available and the base and step values are
> within [-16, 15].
> > >>>
> > >>> Are there multiplication by or addition of scalar immediate
> > >>> instructions to enhance this with two-instruction sequences?
> > >>
> > >> No, Richard, I can't think of any equivalent two-instruction sequences.
> > >
> > > There are some.  E.g.:
> > >
> > >     { 16, 17, 18, 19, ... }
> > >
> > > could be:
> > >
> > >        index   z0.b, #0, #1
> > >        add     z0.b, z0.b, #16
> > >
> > > or, alternatively:
> > >
> > >        mov     w0, #16
> > >        index   z0.b, w0, #1
> 
> I guess even step between [16, 31] could be handed with index with half step
> and then adding the result to itself (multiply by immediate #2), even if there's
> no direct vector-by-immediate instruction available.  Likewise of course  some
> { A0 + n * B1 + n * B2, ... } can be handled by adding two index compute
> results.

Thanks for the example, Richard! It does seem to be something worth looking into.

Thanks,
Pengxuan
> 
> > > But these cases are less obviously a win, so I think it's ok to
> > > handle single instructions only for now.
> >
> > (Not related to this patch, this work is great, thanks Pengxuan!)
> > Looking at some SWOGs like for Neoverse V2 it looks like the first sequence
> is preferable.
> > On that core the INDEX-immediates-only operation has latency 4 and
> throughput 2 and the SVE ADD is as cheap as SIMD operations can be on that
> core.
> > But in the second sequence the INDEX-reg-operand has latency 7 and
> throughput 1 as it seems to treat it as a GP <-> SIMD transfer of some sort.
> 
> So what's the latency/throughput of a vector load from constant pool (can we
> even have a "SVE" constant pool?  I assume entries would have to be of the
> architecturally largest vector size?), assuming it's in L1 (where it would occupy
> quite some space eventually).
> 
> Richard.
> 
> > We may encounter a situation in the future where we’ll want to optimize the
> second sequence (if it comes from intrinsics code for example) into the first.
> > Thanks,
> > Kyrill
> >
> >
> > >
> > > The patch is ok for trunk, thanks, but:
> > >
> > >>>> @@ -22991,7 +22991,7 @@ aarch64_simd_valid_immediate (rtx op,
> > >>> simd_immediate_info *info,
> > >>>>   if (CONST_VECTOR_P (op)
> > >>>>       && CONST_VECTOR_DUPLICATE_P (op))
> > >>>>     n_elts = CONST_VECTOR_NPATTERNS (op);
> > >>>> -  else if ((vec_flags & VEC_SVE_DATA)
> > >>>> +  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
> > >>>>           && const_vec_series_p (op, &base, &step))
> > >
> > > ...the convention is to have one && condition per line if the whole
> > > expression doesn't fit on a single line:
> > >
> > >  else if (which == AARCH64_CHECK_MOV
> > >           && TARGET_SVE
> > >           && const_vec_series_p (op, &base, &step))
> > >
> > > Richard
> >

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 27e24ba70ab..6b3ca57d0eb 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -22991,7 +22991,7 @@  aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
   if (CONST_VECTOR_P (op)
       && CONST_VECTOR_DUPLICATE_P (op))
     n_elts = CONST_VECTOR_NPATTERNS (op);
-  else if ((vec_flags & VEC_SVE_DATA)
+  else if (which == AARCH64_CHECK_MOV && TARGET_SVE
 	   && const_vec_series_p (op, &base, &step))
     {
       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
@@ -25249,6 +25249,16 @@  aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 
   if (which == AARCH64_CHECK_MOV)
     {
+      if (info.insn == simd_immediate_info::INDEX)
+	{
+	  gcc_assert (TARGET_SVE);
+	  snprintf (templ, sizeof (templ), "index\t%%Z0.%c, #"
+		    HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
+		    element_char, INTVAL (info.u.index.base),
+		    INTVAL (info.u.index.step));
+	  return templ;
+	}
+
       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
 		  ? "msl" : "lsl");
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
index 216699b0536..0940bedd0dd 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_1.c
@@ -10,7 +10,6 @@  dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t1\n\t\.word\t2\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
index d494943a275..218a6601337 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_2.c
@@ -10,7 +10,6 @@  dupq (int x)
   return svdupq_s32 (x, 1, 2, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[0\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t2\n\t\.word\t1\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
index 4bc8259df07..245d43b75b5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_3.c
@@ -10,7 +10,6 @@  dupq (int x)
   return svdupq_s32 (0, 1, x, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #0, #1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t0\n\t\.word\t1\n\t\.word\t[^\n]*\n\t\.word\t3\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
index 6f9f9f2f22f..cbee6f27b62 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_4.c
@@ -10,7 +10,6 @@  dupq (int x)
   return svdupq_s32 (0, 1, x, 3);
 }
 
-/* { dg-final { scan-assembler {\tldr\tq[0-9]+,} } } */
+/* { dg-final { scan-assembler {\tindex\tz[0-9]+\.s, #3, #-1} } } */
 /* { dg-final { scan-assembler {\tins\tv[0-9]+\.s\[2\], w0\n} } } */
 /* { dg-final { scan-assembler {\tdup\tz[0-9]+\.q, z[0-9]+\.q\[0\]\n} } } */
-/* { dg-final { scan-assembler {\t\.word\t3\n\t\.word\t[^\n]*\n\t\.word\t1\n\t\.word\t0\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
new file mode 100644
index 00000000000..25910dbfa1f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vec_init_3.c
@@ -0,0 +1,99 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef short v8hi __attribute__ ((vector_size (16)));
+typedef short v4hi __attribute__ ((vector_size (8)));
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v2si __attribute__ ((vector_size (8)));
+typedef long v2di __attribute__ ((vector_size (16)));
+
+/*
+** f_v16qi:
+**	index	z0\.b, #0, #1
+**	ret
+*/
+v16qi
+f_v16qi (void)
+{
+  return (v16qi){ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+}
+
+/*
+** f_v8qi:
+**	index	z0\.b, #0, #1
+**	ret
+*/
+v8qi
+f_v8qi (void)
+{
+  return (v8qi){ 0, 1, 2, 3, 4, 5, 6, 7 };
+}
+
+/*
+** f_v8hi:
+**	index	z0\.h, #0, #1
+**	ret
+*/
+v8hi
+f_v8hi (void)
+{
+  return (v8hi){ 0, 1, 2, 3, 4, 5, 6, 7 };
+}
+
+/*
+** f_v4hi:
+**	index	z0\.h, #0, #1
+**	ret
+*/
+v4hi
+f_v4hi (void)
+{
+  return (v4hi){ 0, 1, 2, 3 };
+}
+
+/*
+** f_v4si:
+**	index	z0\.s, #0, #1
+**	ret
+*/
+v4si
+f_v4si (void)
+{
+  return (v4si){ 0, 1, 2, 3 };
+}
+
+/*
+** f_v2si:
+**	index	z0\.s, #0, #1
+**	ret
+*/
+v2si
+f_v2si (void)
+{
+  return (v2si){ 0, 1 };
+}
+
+/*
+** f_v2di:
+**	index	z0\.d, #0, #1
+**	ret
+*/
+v2di
+f_v2di (void)
+{
+  return (v2di){ 0, 1 };
+}
+
+/*
+** g_v4si:
+**	index	z0\.s, #3, #-4
+**	ret
+*/
+v4si
+g_v4si (void)
+{
+  return (v4si){ 3, -1, -5, -9 };
+}