Message ID | ZoaGvrPP2MHCMiwW@arm.com |
---|---|
State | New |
Headers | show |
Series | [1/2] AArch64: make aarch64_simd_vec_unpack<su>_lo_/_hi_ consistent. | expand |
Tamar Christina <tamar.christina@arm.com> writes: > Hi All, > > When a two reg TBL is performed with one operand being a zero vector we can > instead use a single reg TBL and map the indices for accessing the zero vector > to an out of range constant. > > On AArch64 out of range indices into a TBL have a defined semantics of setting > the element to zero. Many uArches have a slower 2-reg TBL than 1-reg TBL. > > Before this change we had: > > typedef unsigned int v4si __attribute__ ((vector_size (16))); > > v4si f1 (v4si a) > { > v4si zeros = {0,0,0,0}; > return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); > } > > which generates: > > f1: > mov v30.16b, v0.16b > movi v31.4s, 0 > adrp x0, .LC0 > ldr q0, [x0, #:lo12:.LC0] > tbl v0.16b, {v30.16b - v31.16b}, v0.16b > ret > > .LC0: > .byte 0 > .byte 1 > .byte 2 > .byte 3 > .byte 20 > .byte 21 > .byte 22 > .byte 23 > .byte 4 > .byte 5 > .byte 6 > .byte 7 > .byte 24 > .byte 25 > .byte 26 > .byte 27 > > and with the patch: > > f1: > adrp x0, .LC0 > ldr q31, [x0, #:lo12:.LC0] > tbl v0.16b, {v0.16b}, v31.16b > ret Nice! > > .LC0: > .byte 0 > .byte 1 > .byte 2 > .byte 3 > .byte -1 > .byte -1 > .byte -1 > .byte -1 > .byte 4 > .byte 5 > .byte 6 > .byte 7 > .byte -1 > .byte -1 > .byte -1 > .byte -1 > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > This sequence is generated often by openmp and aside from the > strict performance impact of this change, it also gives better > register allocation as we no longer have the consecutive > register limitation. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0 and > zero_op1. > (aarch64_evpc_tbl): Implement register value remapping. > (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup > before it's forced to a reg. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/tbl_with_zero.c: New test. 
> > --- > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index 6f49d1482042efabedbe723aa59ecf129b84f4ad..655b93e65a7b686a2b82e8ada64cac084ca397d4 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -25384,6 +25384,7 @@ struct expand_vec_perm_d > unsigned int vec_flags; > unsigned int op_vec_flags; > bool one_vector_p; > + bool zero_op0, zero_op1; Nit: might as well add _p suffixes for consistency with the existing fields. > bool testing_p; > }; > > @@ -25880,13 +25881,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) > /* to_constant is safe since this routine is specific to Advanced SIMD > vectors. */ > unsigned int nelt = d->perm.length ().to_constant (); > + > + /* If one register is the constant vector of 0 then we only need > + a one reg TBL and we map any accesses to the vector of 0 to -1. We can't > + do this earlier since vec_perm_indices clamps elements to within range so > + we can only do it during codegen. */ > + if (d->zero_op0) > + d->op0 = d->op1; > + else if (d->zero_op1) > + d->op1 = d->op0; > + > for (unsigned int i = 0; i < nelt; ++i) > - /* If big-endian and two vectors we end up with a weird mixed-endian > - mode on NEON. Reverse the index within each word but not the word > - itself. to_constant is safe because we checked is_constant above. */ > - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN > - ? d->perm[i].to_constant () ^ (nelt - 1) > - : d->perm[i].to_constant ()); > + { > + auto val = d->perm[i].to_constant (); > + > + /* If we're selecting from a 0 vector, we can just use an out of range > + index instead. */ > + if ((d->zero_op0 && val < nelt) || (d->zero_op1 && val >= nelt)) > + rperm[i] = constm1_rtx; > + else > + { > + /* If we are remapping a zero register as the first parameter we need > + to adjust the indices of the non-zero register. 
*/ > + if (d->zero_op0) > + val = val % nelt; > + > + /* If big-endian and two vectors we end up with a weird mixed-endian > + mode on NEON. Reverse the index within each word but not the word > + itself. to_constant is safe because we checked is_constant > + above. */ > + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val); > + } > + } > > sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); > sel = force_reg (vmode, sel); > @@ -26132,6 +26158,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, > const vec_perm_indices &sel) > { > struct expand_vec_perm_d d; > + d.zero_op0 = d.zero_op1 = false; > + rtx e; > > /* Check whether the mask can be applied to a single vector. */ > if (sel.ninputs () == 1 > @@ -26147,6 +26175,12 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, > d.one_vector_p = true; > op0 = op1; > } > + else if (op0 && ((d.zero_op0 = const_vec_duplicate_p (op0, &e)) > + && const0_rtx == e)) > + d.one_vector_p = false; > + else if (op1 && ((d.zero_op1 = const_vec_duplicate_p (op1, &e)) > + && const0_rtx == e)) > + d.one_vector_p = false; > else > d.one_vector_p = false; Given the bracketing, this sets d.zero_op0/1 if the operands are a duplicate of any value, not just zero. 
IMO it'd be simpler to drop the two hunks above and instead add: d.zero_op0_p = op0 == CONST0_RTX (op_mode); d.zero_op1_p = op1 == CONST0_RTX (op_mode); > diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c > new file mode 100644 > index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c > @@ -0,0 +1,40 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O1" } */ > + > +typedef unsigned int v4si __attribute__ ((vector_size (16))); > + > +v4si f1 (v4si a) > +{ > + v4si zeros = {0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); > +} > + > +typedef unsigned short v8hi __attribute__ ((vector_size (16))); > + > +v8hi f2a (v8hi a) > +{ > + v8hi zeros = {0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12); > +} > + > +v8hi f2b (v8hi a) > +{ > + v8hi zeros = {0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8); > +} > + > +typedef unsigned char v16qi __attribute__ ((vector_size (16))); > + > +v16qi f3a (v16qi a) > +{ > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24); > +} > + > +v16qi f3b (v16qi a) > +{ > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12); > +} > + > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */ It'd be good to test with zeros as the first argument too. Thanks, Richard
> > +v16qi f3b (v16qi a) > > +{ > > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; > > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, > 7, 12); > > +} > > + > > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0- > 9]+.16b} 5 } } */ > > It'd be good to test with zeros as the first argument too. > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p and zero_op1_p. (aarch64_evpc_tbl): Implement register value remapping. (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup before it's forced to a reg. gcc/testsuite/ChangeLog: * gcc.target/aarch64/tbl_with_zero_1.c: New test. * gcc.target/aarch64/tbl_with_zero_2.c: New test. -- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25413,6 +25413,7 @@ struct expand_vec_perm_d unsigned int vec_flags; unsigned int op_vec_flags; bool one_vector_p; + bool zero_op0_p, zero_op1_p; bool testing_p; }; @@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) /* to_constant is safe since this routine is specific to Advanced SIMD vectors. */ unsigned int nelt = d->perm.length ().to_constant (); + + /* If one register is the constant vector of 0 then we only need + a one reg TBL and we map any accesses to the vector of 0 to -1. We can't + do this earlier since vec_perm_indices clamps elements to within range so + we can only do it during codegen. */ + if (d->zero_op0_p) + d->op0 = d->op1; + else if (d->zero_op1_p) + d->op1 = d->op0; + for (unsigned int i = 0; i < nelt; ++i) - /* If big-endian and two vectors we end up with a weird mixed-endian - mode on NEON. 
Reverse the index within each word but not the word - itself. to_constant is safe because we checked is_constant above. */ - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN - ? d->perm[i].to_constant () ^ (nelt - 1) - : d->perm[i].to_constant ()); + { + auto val = d->perm[i].to_constant (); + + /* If we're selecting from a 0 vector, we can just use an out of range + index instead. */ + if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt)) + rperm[i] = constm1_rtx; + else + { + /* If we are remapping a zero register as the first parameter we need + to adjust the indices of the non-zero register. */ + if (d->zero_op0_p) + val = val % nelt; + + /* If big-endian and two vectors we end up with a weird mixed-endian + mode on NEON. Reverse the index within each word but not the word + itself. to_constant is safe because we checked is_constant + above. */ + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val); + } + } sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); sel = force_reg (vmode, sel); @@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, const vec_perm_indices &sel) { struct expand_vec_perm_d d; + d.zero_op0_p = d.zero_op1_p = false; /* Check whether the mask can be applied to a single vector. */ if (sel.ninputs () == 1 @@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, else d.one_vector_p = false; + d.zero_op0_p = op0 == CONST0_RTX (op_mode); + d.zero_op1_p = op1 == CONST0_RTX (op_mode); d.perm.new_vector (sel.encoding (), d.one_vector_p ? 
1 : 2, sel.nelts_per_input ()); d.vmode = vmode; diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c new file mode 100644 index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O1" } */ + +typedef unsigned int v4si __attribute__ ((vector_size (16))); + +v4si f1 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); +} + +typedef unsigned short v8hi __attribute__ ((vector_size (16))); + +v8hi f2a (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12); +} + +v8hi f2b (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8); +} + +typedef unsigned char v16qi __attribute__ ((vector_size (16))); + +v16qi f3a (v16qi a) +{ + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24); +} + +v16qi f3b (v16qi a) +{ + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12); +} + +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c new file mode 100644 index 0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target le } */ +/* { dg-additional-options "-O1" } */ + +typedef unsigned int v4si __attribute__ ((vector_size (16))); + +v4si f1 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return 
__builtin_shufflevector (zeros, a, 0, 5, 1, 6); +} + +v4si f2 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); +} + +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */ +/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */
Tamar Christina <Tamar.Christina@arm.com> writes: >> > +v16qi f3b (v16qi a) >> > +{ >> > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; >> > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, >> 7, 12); >> > +} >> > + >> > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0- >> 9]+.16b} 5 } } */ >> >> It'd be good to test with zeros as the first argument too. >> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64.cc (struct expand_vec_perm_d): Add zero_op0_p > and zero_op1_p. > (aarch64_evpc_tbl): Implement register value remapping. > (aarch64_vectorize_vec_perm_const): Detect if operand is a zero dup > before it's forced to a reg. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/tbl_with_zero_1.c: New test. > * gcc.target/aarch64/tbl_with_zero_2.c: New test. > > -- inline copy of patch -- > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index 469eb938953a70bc6b0ce3d4aa16f773e40ee03e..2d596c19a31a09b4ccbc957d42dce91e453a0dec 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -25413,6 +25413,7 @@ struct expand_vec_perm_d > unsigned int vec_flags; > unsigned int op_vec_flags; > bool one_vector_p; > + bool zero_op0_p, zero_op1_p; > bool testing_p; > }; > > @@ -25909,13 +25910,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) > /* to_constant is safe since this routine is specific to Advanced SIMD > vectors. */ > unsigned int nelt = d->perm.length ().to_constant (); > + > + /* If one register is the constant vector of 0 then we only need > + a one reg TBL and we map any accesses to the vector of 0 to -1. We can't > + do this earlier since vec_perm_indices clamps elements to within range so > + we can only do it during codegen. 
*/ > + if (d->zero_op0_p) > + d->op0 = d->op1; > + else if (d->zero_op1_p) > + d->op1 = d->op0; > + > for (unsigned int i = 0; i < nelt; ++i) > - /* If big-endian and two vectors we end up with a weird mixed-endian > - mode on NEON. Reverse the index within each word but not the word > - itself. to_constant is safe because we checked is_constant above. */ > - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN > - ? d->perm[i].to_constant () ^ (nelt - 1) > - : d->perm[i].to_constant ()); > + { > + auto val = d->perm[i].to_constant (); > + > + /* If we're selecting from a 0 vector, we can just use an out of range > + index instead. */ > + if ((d->zero_op0_p && val < nelt) || (d->zero_op1_p && val >= nelt)) > + rperm[i] = constm1_rtx; > + else > + { > + /* If we are remapping a zero register as the first parameter we need > + to adjust the indices of the non-zero register. */ > + if (d->zero_op0_p) > + val = val % nelt; > + > + /* If big-endian and two vectors we end up with a weird mixed-endian > + mode on NEON. Reverse the index within each word but not the word > + itself. to_constant is safe because we checked is_constant > + above. */ > + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? val ^ (nelt - 1) : val); > + } > + } > > sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); > sel = force_reg (vmode, sel); > @@ -26161,6 +26187,7 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, > const vec_perm_indices &sel) > { > struct expand_vec_perm_d d; > + d.zero_op0_p = d.zero_op1_p = false; This is redundant with the assignments below. OK with that line removed. Not sure whether the test will work on Darwin & mingw, but we can restrict to ELF targets later if necessary. Thanks, Richard > > /* Check whether the mask can be applied to a single vector. 
*/ > if (sel.ninputs () == 1 > @@ -26179,6 +26206,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, > else > d.one_vector_p = false; > > + d.zero_op0_p = op0 == CONST0_RTX (op_mode); > + d.zero_op1_p = op1 == CONST0_RTX (op_mode); > d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, > sel.nelts_per_input ()); > d.vmode = vmode; > diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c > new file mode 100644 > index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_1.c > @@ -0,0 +1,40 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O1" } */ > + > +typedef unsigned int v4si __attribute__ ((vector_size (16))); > + > +v4si f1 (v4si a) > +{ > + v4si zeros = {0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); > +} > + > +typedef unsigned short v8hi __attribute__ ((vector_size (16))); > + > +v8hi f2a (v8hi a) > +{ > + v8hi zeros = {0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12); > +} > + > +v8hi f2b (v8hi a) > +{ > + v8hi zeros = {0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8); > +} > + > +typedef unsigned char v16qi __attribute__ ((vector_size (16))); > + > +v16qi f3a (v16qi a) > +{ > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24); > +} > + > +v16qi f3b (v16qi a) > +{ > + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12); > +} > + > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c > new file 
mode 100644 > index 0000000000000000000000000000000000000000..e7d5a678aa5178c00036fd91fc4d776f188d898e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero_2.c > @@ -0,0 +1,20 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target le } */ > +/* { dg-additional-options "-O1" } */ > + > +typedef unsigned int v4si __attribute__ ((vector_size (16))); > + > +v4si f1 (v4si a) > +{ > + v4si zeros = {0,0,0,0}; > + return __builtin_shufflevector (zeros, a, 0, 5, 1, 6); > +} > + > +v4si f2 (v4si a) > +{ > + v4si zeros = {0,0,0,0}; > + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); > +} > + > +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 2 } } */ > +/* { dg-final { scan-assembler-times {(\.byte\s+-1\n\s+){4}(\.byte\s+[4-7]+\n\s+){4}(\.byte\s+-1\n\s+){4}(\.byte\s+(8|9|10|11)+\n?\s*){4}} 1 } } */
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 6f49d1482042efabedbe723aa59ecf129b84f4ad..655b93e65a7b686a2b82e8ada64cac084ca397d4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -25384,6 +25384,7 @@ struct expand_vec_perm_d unsigned int vec_flags; unsigned int op_vec_flags; bool one_vector_p; + bool zero_op0, zero_op1; bool testing_p; }; @@ -25880,13 +25881,38 @@ aarch64_evpc_tbl (struct expand_vec_perm_d *d) /* to_constant is safe since this routine is specific to Advanced SIMD vectors. */ unsigned int nelt = d->perm.length ().to_constant (); + + /* If one register is the constant vector of 0 then we only need + a one reg TBL and we map any accesses to the vector of 0 to -1. We can't + do this earlier since vec_perm_indices clamps elements to within range so + we can only do it during codegen. */ + if (d->zero_op0) + d->op0 = d->op1; + else if (d->zero_op1) + d->op1 = d->op0; + for (unsigned int i = 0; i < nelt; ++i) - /* If big-endian and two vectors we end up with a weird mixed-endian - mode on NEON. Reverse the index within each word but not the word - itself. to_constant is safe because we checked is_constant above. */ - rperm[i] = GEN_INT (BYTES_BIG_ENDIAN - ? d->perm[i].to_constant () ^ (nelt - 1) - : d->perm[i].to_constant ()); + { + auto val = d->perm[i].to_constant (); + + /* If we're selecting from a 0 vector, we can just use an out of range + index instead. */ + if ((d->zero_op0 && val < nelt) || (d->zero_op1 && val >= nelt)) + rperm[i] = constm1_rtx; + else + { + /* If we are remapping a zero register as the first parameter we need + to adjust the indices of the non-zero register. */ + if (d->zero_op0) + val = val % nelt; + + /* If big-endian and two vectors we end up with a weird mixed-endian + mode on NEON. Reverse the index within each word but not the word + itself. to_constant is safe because we checked is_constant + above. */ + rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? 
val ^ (nelt - 1) : val); + } + } sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); sel = force_reg (vmode, sel); @@ -26132,6 +26158,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, const vec_perm_indices &sel) { struct expand_vec_perm_d d; + d.zero_op0 = d.zero_op1 = false; + rtx e; /* Check whether the mask can be applied to a single vector. */ if (sel.ninputs () == 1 @@ -26147,6 +26175,12 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, d.one_vector_p = true; op0 = op1; } + else if (op0 && ((d.zero_op0 = const_vec_duplicate_p (op0, &e)) + && const0_rtx == e)) + d.one_vector_p = false; + else if (op1 && ((d.zero_op1 = const_vec_duplicate_p (op1, &e)) + && const0_rtx == e)) + d.one_vector_p = false; else d.one_vector_p = false; diff --git a/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..5595127f3302164b1eb06be50d5c37d41095eb06 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/tbl_with_zero.c @@ -0,0 +1,40 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O1" } */ + +typedef unsigned int v4si __attribute__ ((vector_size (16))); + +v4si f1 (v4si a) +{ + v4si zeros = {0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6); +} + +typedef unsigned short v8hi __attribute__ ((vector_size (16))); + +v8hi f2a (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 9, 1, 10, 2, 11, 3, 12); +} + +v8hi f2b (v8hi a) +{ + v8hi zeros = {0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8); +} + +typedef unsigned char v16qi __attribute__ ((vector_size (16))); + +v16qi f3a (v16qi a) +{ + v16qi zeros = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7, 24); +} + +v16qi f3b (v16qi a) +{ + v16qi zeros = 
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + return __builtin_shufflevector (a, zeros, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12); +} + +/* { dg-final { scan-assembler-times {tbl\tv[0-9]+.16b, \{v[0-9]+.16b\}, v[0-9]+.16b} 5 } } */