diff mbox series

[2/2] AArch64: lower 2 reg TBL permutes with one zero register to 1 reg TBL.

Message ID ZoaGvrPP2MHCMiwW@arm.com
State New
Headers show
Series [1/2] AArch64: make aarch64_simd_vec_unpack<su>_lo_/_hi_ consistent. | expand

Commit Message

Tamar Christina July 4, 2024, 11:25 a.m. UTC
Hi All,

When a two reg TBL is performed with one operand being a zero vector we can
instead use a single reg TBL and map the indices for accessing the zero vector
to an out of range constant.

On AArch64 out of range indices into a TBL have a defined semantics of setting
the element to zero.  Many uArches have a slower 2-reg TBL than 1-reg TBL.

Before this change we had:

typedef unsigned int v4si __attribute__ ((vector_size (16)));

v4si f1 (v4si a)
{
  v4si zeros = {0,0,0,0};
  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
}

which generates:

f1:
        mov     v30.16b, v0.16b
        movi    v31.4s, 0
        adrp    x0, .LC0
        ldr     q0, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v30.16b - v31.16b}, v0.16b
        ret

.LC0:
        .byte   0
        .byte   1
        .byte   2
        .byte   3
        .byte   20
        .byte   21
        .byte   22
        .byte   23
        .byte   4
        .byte   5
        .byte   6
        .byte   7
        .byte   24
        .byte   25
        .byte   26
        .byte   27

and with the patch:

f1:
        adrp    x0, .LC0
        ldr     q31, [x0, #:lo12:.LC0]
        tbl     v0.16b, {v0.16b}, v31.16b
        ret

.LC0:
        .byte   0
        .byte   1
        .byte   2
        .byte   3
        .byte   -1
        .byte   -1
        .byte   -1
        .byte   -1
        .byte   4
        .byte   5
        .byte   6
        .byte   7
        .byte   -1
        .byte   -1
        .byte   -1
        .byte   -1

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

This sequence is generated often by openmp and aside from the
strict performance impact of this change, it also gives better
register allocation as we no longer have the consecutive
register limitation.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0 and
	zero_op1.
	(aarch64_evpc_tbl): Implement register value remapping.
	(aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
	before it's forced to a reg.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/tbl_with_zero.c: New test.

---




--

Comments

Richard Sandiford July 4, 2024, 12:01 p.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> When a two reg TBL is performed with one operand being a zero vector we can
> instead use a single reg TBL and map the indices for accessing the zero vector
> to an out of range constant.
>
> On AArch64 out of range indices into a TBL have a defined semantics of setting
> the element to zero.  Many uArches have a slower 2-reg TBL than 1-reg TBL.
>
> Before this change we had:
>
> typedef unsigned int v4si __attribute__ ((vector_size (16)));
>
> v4si f1 (v4si a)
> {
>   v4si zeros = {0,0,0,0};
>   return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> }
>
> which generates:
>
> f1:
>         mov     v30.16b, v0.16b
>         movi    v31.4s, 0
>         adrp    x0, .LC0
>         ldr     q0, [x0, #:lo12:.LC0]
>         tbl     v0.16b, {v30.16b - v31.16b}, v0.16b
>         ret
>
> .LC0:
>         .byte   0
>         .byte   1
>         .byte   2
>         .byte   3
>         .byte   20
>         .byte   21
>         .byte   22
>         .byte   23
>         .byte   4
>         .byte   5
>         .byte   6
>         .byte   7
>         .byte   24
>         .byte   25
>         .byte   26
>         .byte   27
>
> and with the patch:
>
> f1:
>         adrp    x0, .LC0
>         ldr     q31, [x0, #:lo12:.LC0]
>         tbl     v0.16b, {v0.16b}, v31.16b
>         ret

Nice!

>
> .LC0:
>         .byte   0
>         .byte   1
>         .byte   2
>         .byte   3
>         .byte   -1
>         .byte   -1
>         .byte   -1
>         .byte   -1
>         .byte   4
>         .byte   5
>         .byte   6
>         .byte   7
>         .byte   -1
>         .byte   -1
>         .byte   -1
>         .byte   -1
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> This sequence is generated often by openmp and aside from the
> strict performance impact of this change, it also gives better
> register allocation as we no longer have the consecutive
> register limitation.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0 and
> 	zero_op1.
> 	(aarch64_evpc_tbl): Implement register value remapping.
> 	(aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
> 	before it's forced to a reg.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/tbl_with_zero.c: New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 6f49d1482042efabedbe723aa59ecf129b84f4ad..655b93e65a7b686a2b82e8ada64cac084ca397d4 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25384,6 +25384,7 @@ struct expand_vec_perm_d
>    unsigned int vec_flags;
>    unsigned int op_vec_flags;
>    bool one_vector_p;
> +  bool zero_op0, zero_op1;

Nit: might as well add _p suffixes for consistency with the existing fields.

>    bool testing_p;
>  };
>  
> @@ -25880,13 +25881,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
>    /* to_constant is safe since this routine is specific to Advanced SIMD
>       vectors.  */
>    unsigned int nelt = d->perm.length ().to_constant ();
> +
> +  /* If one register is the constant vector of 0 then we only need
> +     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
> +     do this earlier since vec_perm_indices clamps elements to within range so
> +     we can only do it during codegen.  */
> +  if (d->zero_op0)
> +    d->op0 = d->op1;
> +  else if (d->zero_op1)
> +    d->op1 = d->op0;
> +
>    for (unsigned int i = 0; i < nelt; ++i)
> -    /* If big-endian and two vectors we end up with a weird mixed-endian
> -       mode on NEON.  Reverse the index within each word but not the word
> -       itself.  to_constant is safe because we checked is_constant above.  */
> -    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
> -			? d->perm[i].to_constant () ^ (nelt - 1)
> -			: d->perm[i].to_constant ());
> +    {
> +      auto val = d->perm[i].to_constant ();
> +
> +      /* If we're selecting from a 0 vector, we can just use an out of range
> +	 index instead.  */
> +      if ((d->zero_op0 && val < nelt) || (d->zero_op1 && val >= nelt))
> +	rperm[i] = constm1_rtx;
> +      else
> +	{
> +	  /* If we are remapping a zero register as the first parameter we need
> +	     to adjust the indices of the non-zero register.  */
> +	  if (d->zero_op0)
> +	    val = val % nelt;
> +
> +	  /* If big-endian and two vectors we end up with a weird mixed-endian
> +	     mode on NEON.  Reverse the index within each word but not the word
> +	     itself.  to_constant is safe because we checked is_constant
> +	     above.  */
> +	  rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
> +	}
> +    }
>  
>    sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
>    sel = force_reg (vmode, sel);
> @@ -26132,6 +26158,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>  				  const vec_perm_indices &sel)
>  {
>    struct expand_vec_perm_d d;
> +  d.zero_op0 = d.zero_op1 = false;
> +  rtx e;
>  
>    /* Check whether the mask can be applied to a single vector.  */
>    if (sel.ninputs () == 1
> @@ -26147,6 +26175,12 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>        d.one_vector_p = true;
>        op0 = op1;
>      }
> +  else if (op0 && ((d.zero_op0 = const_vec_duplicate_p (op0, &e))
> +		   && const0_rtx == e))
> +    d.one_vector_p = false;
> +  else if (op1 && ((d.zero_op1 = const_vec_duplicate_p (op1, &e))
> +		   && const0_rtx == e))
> +    d.one_vector_p = false;
>    else
>      d.one_vector_p = false;

Given the bracketing, this sets d.zero_op0/1 if the operands are a
duplicate of any value, not just zero.

IMO it'd be simpler to drop the two hunks above and instead add:

  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
  d.zero_op1_p = op1 == CONST0_RTX (op_mode);

> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +typedef unsigned short v8hi __attribute__ ((vector_size (16)));
> +
> +v8hi f2a (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
> +}
> +
> +v8hi f2b (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
> +}
> +
> +typedef unsigned char v16qi __attribute__ ((vector_size (16)));
> +
> +v16qi f3a (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24);
> +}
> +
> +v16qi f3b (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */

It'd be good to test with zeros as the first argument too.

Thanks,
Richard
Tamar Christina July 5, 2024, 8:16 a.m. UTC | #2
> > +v16qi f3b (v16qi a)
> > +{
> > +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> > +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,
> 7, 12);
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-
> 9]+.16b} 5 } } */
> 
> It'd be good to test with zeros as the first argument too.
> 

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p
	and zero_op1_p.
	(aarch64_evpc_tbl): Implement register value remapping.
	(aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
	before it's forced to a reg.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/tbl_with_zero_1.c: New test.
	* gcc.target/aarch64/tbl_with_zero_2.c: New test.

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25413,6 +25413,7 @@ struct expand_vec_perm_d
   unsigned int vec_flags;
   unsigned int op_vec_flags;
   bool one_vector_p;
+  bool zero_op0_p, zero_op1_p;
   bool testing_p;
 };
 
@@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
   /* to_constant is safe since this routine is specific to Advanced SIMD
      vectors.  */
   unsigned int nelt = d->perm.length ().to_constant ();
+
+  /* If one register is the constant vector of 0 then we only need
+     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
+     do this earlier since vec_perm_indices clamps elements to within range so
+     we can only do it during codegen.  */
+  if (d->zero_op0_p)
+    d->op0 = d->op1;
+  else if (d->zero_op1_p)
+    d->op1 = d->op0;
+
   for (unsigned int i = 0; i < nelt; ++i)
-    /* If big-endian and two vectors we end up with a weird mixed-endian
-       mode on NEON.  Reverse the index within each word but not the word
-       itself.  to_constant is safe because we checked is_constant above.  */
-    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
-			? d->perm[i].to_constant () ^ (nelt - 1)
-			: d->perm[i].to_constant ());
+    {
+      auto val = d->perm[i].to_constant ();
+
+      /* If we're selecting from a 0 vector, we can just use an out of range
+	 index instead.  */
+      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
+	rperm[i] = constm1_rtx;
+      else
+	{
+	  /* If we are remapping a zero register as the first parameter we need
+	     to adjust the indices of the non-zero register.  */
+	  if (d->zero_op0_p)
+	    val = val % nelt;
+
+	  /* If big-endian and two vectors we end up with a weird mixed-endian
+	     mode on NEON.  Reverse the index within each word but not the word
+	     itself.  to_constant is safe because we checked is_constant
+	     above.  */
+	  rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
+	}
+    }
 
   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
   sel = force_reg (vmode, sel);
@@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  const vec_perm_indices &sel)
 {
   struct expand_vec_perm_d d;
+  d.zero_op0_p = d.zero_op1_p = false;
 
   /* Check whether the mask can be applied to a single vector.  */
   if (sel.ninputs () == 1
@@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   else
     d.one_vector_p = false;
 
+  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
+  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
 		     sel.nelts_per_input ());
   d.vmode = vmode;
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+typedef unsigned short v8hi __attribute__ ((vector_size (16)));
+
+v8hi f2a (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
+}
+
+v8hi f2b (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
+}
+
+typedef unsigned char v16qi __attribute__ ((vector_size (16)));
+
+v16qi f3a (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24);
+}
+
+v16qi f3b (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target le } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (zeros, a, 0, 5, 1, 6);
+}
+
+v4si f2 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */
+/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */
Richard Sandiford July 5, 2024, 8:36 a.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> > +v16qi f3b (v16qi a)
>> > +{
>> > +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
>> > +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,
>> 7, 12);
>> > +}
>> > +
>> > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-
>> 9]+.16b} 5 } } */
>> 
>> It'd be good to test with zeros as the first argument too.
>> 
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p
> 	and zero_op1_p.
> 	(aarch64_evpc_tbl): Implement register value remapping.
> 	(aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup
> 	before it's forced to a reg.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/tbl_with_zero_1.c: New test.
> 	* gcc.target/aarch64/tbl_with_zero_2.c: New test.
>
> -- inline copy of patch --
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -25413,6 +25413,7 @@ struct expand_vec_perm_d
>    unsigned int vec_flags;
>    unsigned int op_vec_flags;
>    bool one_vector_p;
> +  bool zero_op0_p, zero_op1_p;
>    bool testing_p;
>  };
>  
> @@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d)
>    /* to_constant is safe since this routine is specific to Advanced SIMD
>       vectors.  */
>    unsigned int nelt = d->perm.length ().to_constant ();
> +
> +  /* If one register is the constant vector of 0 then we only need
> +     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
> +     do this earlier since vec_perm_indices clamps elements to within range so
> +     we can only do it during codegen.  */
> +  if (d->zero_op0_p)
> +    d->op0 = d->op1;
> +  else if (d->zero_op1_p)
> +    d->op1 = d->op0;
> +
>    for (unsigned int i = 0; i < nelt; ++i)
> -    /* If big-endian and two vectors we end up with a weird mixed-endian
> -       mode on NEON.  Reverse the index within each word but not the word
> -       itself.  to_constant is safe because we checked is_constant above.  */
> -    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
> -			? d->perm[i].to_constant () ^ (nelt - 1)
> -			: d->perm[i].to_constant ());
> +    {
> +      auto val = d->perm[i].to_constant ();
> +
> +      /* If we're selecting from a 0 vector, we can just use an out of range
> +	 index instead.  */
> +      if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt))
> +	rperm[i] = constm1_rtx;
> +      else
> +	{
> +	  /* If we are remapping a zero register as the first parameter we need
> +	     to adjust the indices of the non-zero register.  */
> +	  if (d->zero_op0_p)
> +	    val = val % nelt;
> +
> +	  /* If big-endian and two vectors we end up with a weird mixed-endian
> +	     mode on NEON.  Reverse the index within each word but not the word
> +	     itself.  to_constant is safe because we checked is_constant
> +	     above.  */
> +	  rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
> +	}
> +    }
>  
>    sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
>    sel = force_reg (vmode, sel);
> @@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>  				  const vec_perm_indices &sel)
>  {
>    struct expand_vec_perm_d d;
> +  d.zero_op0_p = d.zero_op1_p = false;

This is redundant with the assignments below.  OK with that line removed.

Not sure whether the test will work on Darwin & mingw, but we can restrict
to ELF targets later if necessary.

Thanks,
Richard

>  
>    /* Check whether the mask can be applied to a single vector.  */
>    if (sel.ninputs () == 1
> @@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>    else
>      d.one_vector_p = false;
>  
> +  d.zero_op0_p = op0 == CONST0_RTX (op_mode);
> +  d.zero_op1_p = op1 == CONST0_RTX (op_mode);
>    d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
>  		     sel.nelts_per_input ());
>    d.vmode = vmode;
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +typedef unsigned short v8hi __attribute__ ((vector_size (16)));
> +
> +v8hi f2a (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
> +}
> +
> +v8hi f2b (v8hi a)
> +{
> +  v8hi zeros = {0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
> +}
> +
> +typedef unsigned char v16qi __attribute__ ((vector_size (16)));
> +
> +v16qi f3a (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24);
> +}
> +
> +v16qi f3b (v16qi a)
> +{
> +  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target le } */
> +/* { dg-additional-options "-O1" } */
> +
> +typedef unsigned int v4si __attribute__ ((vector_size (16)));
> +
> +v4si f1 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (zeros, a, 0, 5, 1, 6);
> +}
> +
> +v4si f2 (v4si a)
> +{
> +  v4si zeros = {0,0,0,0};
> +  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
> +}
> +
> +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */
> +/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 6f49d1482042efabedbe723aa59ecf129b84f4ad..655b93e65a7b686a2b82e8ada64cac084ca397d4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -25384,6 +25384,7 @@  struct expand_vec_perm_d
   unsigned int vec_flags;
   unsigned int op_vec_flags;
   bool one_vector_p;
+  bool zero_op0, zero_op1;
   bool testing_p;
 };
 
@@ -25880,13 +25881,38 @@  aarch64_evpc_tbl (struct expand_vec_perm_d *d)
   /* to_constant is safe since this routine is specific to Advanced SIMD
      vectors.  */
   unsigned int nelt = d->perm.length ().to_constant ();
+
+  /* If one register is the constant vector of 0 then we only need
+     a one reg TBL and we map any accesses to the vector of 0 to -1.  We can't
+     do this earlier since vec_perm_indices clamps elements to within range so
+     we can only do it during codegen.  */
+  if (d->zero_op0)
+    d->op0 = d->op1;
+  else if (d->zero_op1)
+    d->op1 = d->op0;
+
   for (unsigned int i = 0; i < nelt; ++i)
-    /* If big-endian and two vectors we end up with a weird mixed-endian
-       mode on NEON.  Reverse the index within each word but not the word
-       itself.  to_constant is safe because we checked is_constant above.  */
-    rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
-			? d->perm[i].to_constant () ^ (nelt - 1)
-			: d->perm[i].to_constant ());
+    {
+      auto val = d->perm[i].to_constant ();
+
+      /* If we're selecting from a 0 vector, we can just use an out of range
+	 index instead.  */
+      if ((d->zero_op0 && val < nelt) || (d->zero_op1 && val >= nelt))
+	rperm[i] = constm1_rtx;
+      else
+	{
+	  /* If we are remapping a zero register as the first parameter we need
+	     to adjust the indices of the non-zero register.  */
+	  if (d->zero_op0)
+	    val = val % nelt;
+
+	  /* If big-endian and two vectors we end up with a weird mixed-endian
+	     mode on NEON.  Reverse the index within each word but not the word
+	     itself.  to_constant is safe because we checked is_constant
+	     above.  */
+	  rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val);
+	}
+    }
 
   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
   sel = force_reg (vmode, sel);
@@ -26132,6 +26158,8 @@  aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
 				  const vec_perm_indices &sel)
 {
   struct expand_vec_perm_d d;
+  d.zero_op0 = d.zero_op1 = false;
+  rtx e;
 
   /* Check whether the mask can be applied to a single vector.  */
   if (sel.ninputs () == 1
@@ -26147,6 +26175,12 @@  aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
       d.one_vector_p = true;
       op0 = op1;
     }
+  else if (op0 && ((d.zero_op0 = const_vec_duplicate_p (op0, &e))
+		   && const0_rtx == e))
+    d.one_vector_p = false;
+  else if (op1 && ((d.zero_op1 = const_vec_duplicate_p (op1, &e))
+		   && const0_rtx == e))
+    d.one_vector_p = false;
   else
     d.one_vector_p = false;
 
diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
new file mode 100644
index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+
+typedef unsigned int v4si __attribute__ ((vector_size (16)));
+
+v4si f1 (v4si a)
+{
+  v4si zeros = {0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6);
+}
+
+typedef unsigned short v8hi __attribute__ ((vector_size (16)));
+
+v8hi f2a (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12);
+}
+
+v8hi f2b (v8hi a)
+{
+  v8hi zeros = {0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8);
+}
+
+typedef unsigned char v16qi __attribute__ ((vector_size (16)));
+
+v16qi f3a (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24);
+}
+
+v16qi f3b (v16qi a)
+{
+  v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12);
+}
+
+/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */