Message ID | 87pp4m8mkp.fsf@e105548-lin.cambridge.arm.com |
---|---|
State | New |
Headers | show |
On Tue, 23 Jun 2015, Richard Sandiford wrote: > +/* Vector comparisons are defined to produce all-one or all-zero results. */ > +(simplify > + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) > + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > + (convert @0))) I am trying to understand why the test tree_nop_conversion_p is the right one (at least for the transformations not using VIEW_CONVERT_EXPR). By definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector types of the same size and number of elements. It thus seems like a conversion is always fine. For vectors, tree_nop_conversion_p apparently only checks that they have the same mode (quite often VOIDmode I guess). > +/* We could instead convert all instances of the vec_cond to negate, > + but that isn't necessarily a win on its own. */ > +(simplify > + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) > + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > + (minus @3 (convert @0)))) > + > +(simplify > + (plus:c @3 (view_convert_expr Aren't we suppose to drop _expr in match.pd? > + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) > + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > + (minus @3 (convert @0)))) > + > +(simplify > + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) > + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > + (plus @3 (convert @0)))) > + > +(simplify > + (minus @3 (view_convert_expr > + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) > + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > + (plus @3 (convert @0)))) > + > /* Simplifications of comparisons. 
*/ > > /* We can simplify a logical negation of a comparison to the > Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c > =================================================================== > --- /dev/null 2015-06-02 17:27:28.541944012 +0100 > +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c 2015-06-23 12:06:27.120203685 +0100 > @@ -0,0 +1,94 @@ > +/* Make sure that vector comaprison results are not unnecessarily ANDed > + with vectors of 1. */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize" } */ > + > +#define COUNT1(X) if (X) count += 1 > +#define COUNT2(X) if (X) count -= 1 > +#define COUNT3(X) count += (X) > +#define COUNT4(X) count -= (X) > + > +#define COND1(X) (X) > +#define COND2(X) ((X) ? 1 : 0) > +#define COND3(X) ((X) ? -1 : 0) > +#define COND4(X) ((X) ? 0 : 1) > +#define COND5(X) ((X) ? 0 : -1) > + > +#define TEST_LT(X, Y) ((X) < (Y)) > +#define TEST_LE(X, Y) ((X) <= (Y)) > +#define TEST_GT(X, Y) ((X) > (Y)) > +#define TEST_GE(X, Y) ((X) >= (Y)) > +#define TEST_EQ(X, Y) ((X) == (Y)) > +#define TEST_NE(X, Y) ((X) != (Y)) > + > +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \ > + TYPE \ > + reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \ > + { \ > + TYPE count = 0; \ > + for (unsigned int i = 0; i < 1024; ++i) \ > + COUNT (TEST (CMP_ARRAY[i], x)); \ > + return count; \ > + } > + > +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \ > + void \ > + plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ > + { \ > + for (unsigned int i = 0; i < 1024; ++i) \ > + ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \ > + } \ > + void \ > + plusc_##ID (void) \ > + { \ > + for (unsigned int i = 0; i < 1024; ++i) \ > + ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \ > + } \ > + void \ > + minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ > + { \ > + for (unsigned int i = 0; i < 1024; ++i) \ > + ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \ > + } \ > + void \ > + minusc_##ID (void) \ > + { \ > + for (unsigned int i = 0; i < 1024; ++i) \ > + ARRAY[i] += 
COND (TEST (CMP_ARRAY[i], 1)); \ > + } > + > +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \ > + typedef __typeof__(ARRAY[0]) ID##_type; \ > + COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \ > + COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \ > + COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \ > + COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \ > + COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \ > + COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \ > + COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \ > + COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \ > + COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5) > + > +signed int asi[1024] __attribute__ ((aligned (16))); > +unsigned int aui[1024] __attribute__ ((aligned (16))); > +signed long long asl[1024] __attribute__ ((aligned (16))); > +unsigned long long aul[1024] __attribute__ ((aligned (16))); > +float af[1024] __attribute__ ((aligned (16))); > +double ad[1024] __attribute__ ((aligned (16))); > + > +ALL_LOOPS (si_si, aui, asi, TEST_LT) > +ALL_LOOPS (ui_si, aui, asi, TEST_LE) > +ALL_LOOPS (si_ui, aui, asi, TEST_GT) > +ALL_LOOPS (ui_ui, aui, asi, TEST_GE) > +ALL_LOOPS (sl_sl, asl, asl, TEST_NE) > +ALL_LOOPS (ul_ul, aul, aul, TEST_NE) > +ALL_LOOPS (si_f, asi, af, TEST_LE) > +ALL_LOOPS (ui_f, aui, af, TEST_GT) > +ALL_LOOPS (sl_d, asl, ad, TEST_GE) > +ALL_LOOPS (ul_d, aul, ad, TEST_GT) > + > +/* { dg-final { scan-assembler-not "\tand\t" } } */ > +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */ > +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */ > +/* { dg-final { scan-assembler "\tldr\tq" } } */ > +/* { dg-final { scan-assembler "\tstr\tq" } } */
On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote: > On Tue, 23 Jun 2015, Richard Sandiford wrote: > >> +/* Vector comparisons are defined to produce all-one or all-zero results. >> */ >> +(simplify >> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> + (convert @0))) > > > I am trying to understand why the test tree_nop_conversion_p is the right > one (at least for the transformations not using VIEW_CONVERT_EXPR). By > definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector > types of the same size and number of elements. It thus seems like a > conversion is always fine. For vectors, tree_nop_conversion_p apparently > only checks that they have the same mode (quite often VOIDmode I guess). The only conversion we seem to allow is changing the signed vector from the comparison result to an unsigned vector (same number of elements and same mode of the elements). That is, a check using TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably be better (well, technically a TYPE_VECTOR_SUBPARTS && element mode compare should be better as generic vectors might not have a vector mode). I'm fine with using tree_nop_conversion_p for now. >> +/* We could instead convert all instances of the vec_cond to negate, >> + but that isn't necessarily a win on its own. */ so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks more compact at least ;) It would also simplify the patterns below. I'm missing a comment on the transform done by the following patterns. >> +(simplify >> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> + (minus @3 (convert @0)))) >> + >> +(simplify >> + (plus:c @3 (view_convert_expr > > > Aren't we suppose to drop _expr in match.pd? Yes. 
I probably should adjust genmatch.c to reject the _expr variants ;) > >> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> + (minus @3 (convert @0)))) >> + >> +(simplify >> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> + (plus @3 (convert @0)))) >> + >> +(simplify >> + (minus @3 (view_convert_expr >> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> + (plus @3 (convert @0)))) >> + Generally for sign-conversions of vectors you should use view_convert. The above also hints at missing conditional view_convert support and a way to iterate over commutative vs. non-commutative ops so we could write (for op (plus:c minus) rop (minus plus) (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2))) (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) (rop @3 (view_convert @0))))) I'll see implementing that. Richard. >> /* Simplifications of comparisons. */ >> >> /* We can simplify a logical negation of a comparison to the >> Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c >> =================================================================== >> --- /dev/null 2015-06-02 17:27:28.541944012 +0100 >> +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c 2015-06-23 >> 12:06:27.120203685 +0100 >> @@ -0,0 +1,94 @@ >> +/* Make sure that vector comaprison results are not unnecessarily ANDed >> + with vectors of 1. */ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -ftree-vectorize" } */ >> + >> +#define COUNT1(X) if (X) count += 1 >> +#define COUNT2(X) if (X) count -= 1 >> +#define COUNT3(X) count += (X) >> +#define COUNT4(X) count -= (X) >> + >> +#define COND1(X) (X) >> +#define COND2(X) ((X) ? 1 : 0) >> +#define COND3(X) ((X) ? -1 : 0) >> +#define COND4(X) ((X) ? 0 : 1) >> +#define COND5(X) ((X) ? 
0 : -1) >> + >> +#define TEST_LT(X, Y) ((X) < (Y)) >> +#define TEST_LE(X, Y) ((X) <= (Y)) >> +#define TEST_GT(X, Y) ((X) > (Y)) >> +#define TEST_GE(X, Y) ((X) >= (Y)) >> +#define TEST_EQ(X, Y) ((X) == (Y)) >> +#define TEST_NE(X, Y) ((X) != (Y)) >> + >> +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \ >> + TYPE \ >> + reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \ >> + { \ >> + TYPE count = 0; \ >> + for (unsigned int i = 0; i < 1024; ++i) \ >> + COUNT (TEST (CMP_ARRAY[i], x)); \ >> + return count; \ >> + } >> + >> +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \ >> + void \ >> + plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ >> + { \ >> + for (unsigned int i = 0; i < 1024; ++i) \ >> + ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \ >> + } \ >> + void \ >> + plusc_##ID (void) \ >> + { \ >> + for (unsigned int i = 0; i < 1024; ++i) \ >> + ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \ >> + } \ >> + void \ >> + minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ >> + { \ >> + for (unsigned int i = 0; i < 1024; ++i) \ >> + ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \ >> + } \ >> + void \ >> + minusc_##ID (void) \ >> + { \ >> + for (unsigned int i = 0; i < 1024; ++i) \ >> + ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \ >> + } >> + >> +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \ >> + typedef __typeof__(ARRAY[0]) ID##_type; \ >> + COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \ >> + COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \ >> + COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \ >> + COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \ >> + COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \ >> + COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \ >> + COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \ >> + COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \ >> + COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5) >> + >> +signed int asi[1024] __attribute__ ((aligned (16))); >> +unsigned int aui[1024] __attribute__ ((aligned (16))); >> 
+signed long long asl[1024] __attribute__ ((aligned (16))); >> +unsigned long long aul[1024] __attribute__ ((aligned (16))); >> +float af[1024] __attribute__ ((aligned (16))); >> +double ad[1024] __attribute__ ((aligned (16))); >> + >> +ALL_LOOPS (si_si, aui, asi, TEST_LT) >> +ALL_LOOPS (ui_si, aui, asi, TEST_LE) >> +ALL_LOOPS (si_ui, aui, asi, TEST_GT) >> +ALL_LOOPS (ui_ui, aui, asi, TEST_GE) >> +ALL_LOOPS (sl_sl, asl, asl, TEST_NE) >> +ALL_LOOPS (ul_ul, aul, aul, TEST_NE) >> +ALL_LOOPS (si_f, asi, af, TEST_LE) >> +ALL_LOOPS (ui_f, aui, af, TEST_GT) >> +ALL_LOOPS (sl_d, asl, ad, TEST_GE) >> +ALL_LOOPS (ul_d, aul, ad, TEST_GT) >> + >> +/* { dg-final { scan-assembler-not "\tand\t" } } */ >> +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */ >> +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */ >> +/* { dg-final { scan-assembler "\tldr\tq" } } */ >> +/* { dg-final { scan-assembler "\tstr\tq" } } */ > > > -- > Marc Glisse
Richard Biener <richard.guenther@gmail.com> writes: > On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote: >> On Tue, 23 Jun 2015, Richard Sandiford wrote: >> >>> +/* Vector comparisons are defined to produce all-one or all-zero results. >>> */ >>> +(simplify >>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> + (convert @0))) >> >> >> I am trying to understand why the test tree_nop_conversion_p is the right >> one (at least for the transformations not using VIEW_CONVERT_EXPR). By >> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector >> types of the same size and number of elements. It thus seems like a >> conversion is always fine. For vectors, tree_nop_conversion_p apparently >> only checks that they have the same mode (quite often VOIDmode I guess). > > The only conversion we seem to allow is changing the signed vector from > the comparison result to an unsigned vector (same number of elements > and same mode of the elements). That is, a check using > TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably > be better (well, technically a TYPE_VECTOR_SUBPARTS && element > mode compare should be better as generic vectors might not have a vector mode). OK. The reason I was being paranoid was that I couldn't see anywhere where we enforced that the vector condition in a VEC_COND had to have the same element width as the values being selected. tree-cfg.c only checks that rhs2 and rhs3 are compatible with the result. There doesn't seem to be any checking of rhs1 vs. the other types. So I wasn't sure whether anything stopped us from, e.g., comparing two V4HIs and using the result to select between two V4SIs. > I'm fine with using tree_nop_conversion_p for now. I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element mode. 
How about: (if (VECTOR_INTEGER_TYPE_P (type) && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) && (TYPE_MODE (TREE_TYPE (type)) == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) (But is it really OK to be adding more mode-based compatibility checks? I thought you were hoping to move away from modes in the middle end.) >>> +/* We could instead convert all instances of the vec_cond to negate, >>> + but that isn't necessarily a win on its own. */ > > so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks more compact > at least ;) It would also simplify the patterns below. In the past I've dealt with processors where arithmetic wasn't handled as efficiently as logical ops. Seems like an especial risk for 64-bit elements, from a quick scan of the i386 scheduling models. > I'm missing a comment on the transform done by the following patterns. Heh. The comment was supposed to be describing all four at once. I originally had them bunched together without whitespace, but it looked bad. >>> +(simplify >>> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> + (minus @3 (convert @0)))) >>> + >>> +(simplify >>> + (plus:c @3 (view_convert_expr >> >> >> Aren't we suppose to drop _expr in match.pd? > > Yes. I probably should adjust genmatch.c to reject the _expr variants ;) OK. >>> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> + (minus @3 (convert @0)))) >>> + >>> +(simplify >>> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> + (plus @3 (convert @0)))) >>> + >>> +(simplify >>> + (minus @3 (view_convert_expr >>> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> + (plus @3 (convert @0)))) >>> + > > Generally for sign-conversions of vectors you should use view_convert. OK. 
> The above also hints at missing conditional view_convert support > and a way to iterate over commutative vs. non-commutative ops so > we could write > > (for op (plus:c minus) > rop (minus plus) > (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2))) > (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > (rop @3 (view_convert @0))))) > > I'll see implementing that. Looks good. :-) I also realised later that: /* Vector comparisons are defined to produce all-one or all-zero results. */ (simplify (vec_cond @0 integer_all_onesp@1 integer_zerop@2) (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) (convert @0))) is redundant with some fold-const.c code. Thanks, Richard
On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford <richard.sandiford@arm.com> wrote: > Richard Biener <richard.guenther@gmail.com> writes: >> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote: >>> On Tue, 23 Jun 2015, Richard Sandiford wrote: >>> >>>> +/* Vector comparisons are defined to produce all-one or all-zero results. >>>> */ >>>> +(simplify >>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>> + (convert @0))) >>> >>> >>> I am trying to understand why the test tree_nop_conversion_p is the right >>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By >>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector >>> types of the same size and number of elements. It thus seems like a >>> conversion is always fine. For vectors, tree_nop_conversion_p apparently >>> only checks that they have the same mode (quite often VOIDmode I guess). >> >> The only conversion we seem to allow is changing the signed vector from >> the comparison result to an unsigned vector (same number of elements >> and same mode of the elements). That is, a check using >> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably >> be better (well, technically a TYPE_VECTOR_SUBPARTS && element >> mode compare should be better as generic vectors might not have a vector mode). > > OK. The reason I was being paranoid was that I couldn't see anywhere > where we enforced that the vector condition in a VEC_COND had to have > the same element width as the values being selected. We don't require that indeed. > tree-cfg.c > only checks that rhs2 and rhs3 are compatible with the result. > There doesn't seem to be any checking of rhs1 vs. the other types. > So I wasn't sure whether anything stopped us from, e.g., comparing two > V4HIs and using the result to select between two V4SIs. Nothing does (or should). >> I'm fine with using tree_nop_conversion_p for now. 
> > I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element > mode. How about: > > (if (VECTOR_INTEGER_TYPE_P (type) > && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) > && (TYPE_MODE (TREE_TYPE (type)) > == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) > > (But is it really OK to be adding more mode-based compatibility checks? > I thought you were hoping to move away from modes in the middle end.) The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant (the type of a comparison is always a signed vector integer type). Yes, mode-based checks are ok. I don't see us moving away from them. >>>> +/* We could instead convert all instances of the vec_cond to negate, >>>> + but that isn't necessarily a win on its own. */ >> >> so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks more compact >> at least ;) It would also simplify the patterns below. > > In the past I've dealt with processors where arithmetic wasn't handled > as efficiently as logical ops. Seems like an especial risk for 64-bit > elements, from a quick scan of the i386 scheduling models. But then expansion could undo this ... >> I'm missing a comment on the transform done by the following patterns. > > Heh. The comment was supposed to be describing all four at once. > I originally had then bunched together without whitespace, but it > looked bad. > >>>> +(simplify >>>> + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>> + (minus @3 (convert @0)))) >>>> + >>>> +(simplify >>>> + (plus:c @3 (view_convert_expr >>> >>> >>> Aren't we suppose to drop _expr in match.pd? >> >> Yes. I probably should adjust genmatch.c to reject the _expr variants ;) > > OK. 
> >>>> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>> + (minus @3 (convert @0)))) >>>> + >>>> +(simplify >>>> + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) >>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>> + (plus @3 (convert @0)))) >>>> + >>>> +(simplify >>>> + (minus @3 (view_convert_expr >>>> + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>> + (plus @3 (convert @0)))) >>>> + >> >> Generally for sign-conversions of vectors you should use view_convert. > > OK. > >> The above also hints at missing conditional view_convert support >> and a way to iterate over commutative vs. non-commutative ops so >> we could write >> >> (for op (plus:c minus) >> rop (minus plus) >> (op @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2))) >> (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> (rop @3 (view_convert @0))))) >> >> I'll see implementing that. > > Looks good. :-) > > I also realised later that: > > /* Vector comparisons are defined to produce all-one or all-zero results. */ > (simplify > (vec_cond @0 integer_all_onesp@1 integer_zerop@2) > (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) > (convert @0))) > > is redundant with some fold-const.c code. If so then you should remove the fold-const.c at the time you add the pattern. Note that ISTR code performing exactly the opposite transform in fold-const.c ... Richard. > Thanks, > Richard >
Richard Biener <richard.guenther@gmail.com> writes: >>> I'm fine with using tree_nop_conversion_p for now. >> >> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element >> mode. How about: >> >> (if (VECTOR_INTEGER_TYPE_P (type) >> && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) >> && (TYPE_MODE (TREE_TYPE (type)) >> == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) >> >> (But is it really OK to be adding more mode-based compatibility checks? >> I thought you were hoping to move away from modes in the middle end.) > > The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant > (the type of a comparison is always a signed vector integer type). OK, will just use VECTOR_TYPE_P then. >>>>> +/* We could instead convert all instances of the vec_cond to negate, >>>>> + but that isn't necessarily a win on its own. */ >>> >>> so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks more compact >>> at least ;) It would also simplify the patterns below. >> >> In the past I've dealt with processors where arithmetic wasn't handled >> as efficiently as logical ops. Seems like an especial risk for 64-bit >> elements, from a quick scan of the i386 scheduling models. > > But then expansion could undo this ... So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)? Is there precedent for doing that kind of thing? >> I also realised later that: >> >> /* Vector comparisons are defined to produce all-one or all-zero results. */ >> (simplify >> (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >> (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >> (convert @0))) >> >> is redundant with some fold-const.c code. > > If so then you should remove the fold-const.c at the time you add the pattern. Can I just drop that part of the patch instead? The fold-const.c code handles COND_EXPR and VEC_COND_EXPR analogously, so I'd have to move COND_EXPR at the same time. 
And then the natural follow-on would be: why not move the other COND_EXPR and VEC_COND_EXPR folds too? :-) > Note that ISTR code performing exactly the opposite transform in > fold-const.c ... That's another reason why I'm worried about just doing the (negate ...) thing without knowing whether the negate can be folded into anything else. Thanks, Richard
On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford <richard.sandiford@arm.com> wrote: > Richard Biener <richard.guenther@gmail.com> writes: >>>> I'm fine with using tree_nop_conversion_p for now. >>> >>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element >>> mode. How about: >>> >>> (if (VECTOR_INTEGER_TYPE_P (type) >>> && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) >>> && (TYPE_MODE (TREE_TYPE (type)) >>> == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) >>> >>> (But is it really OK to be adding more mode-based compatibility checks? >>> I thought you were hoping to move away from modes in the middle end.) >> >> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant >> (the type of a comparison is always a signed vector integer type). > > OK, will just use VECTOR_TYPE_P then. Given we're in a VEC_COND_EXPR that's redundant as well. >>>>>> +/* We could instead convert all instances of the vec_cond to negate, >>>>>> + but that isn't necessarily a win on its own. */ >>>> >>>> so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks more compact >>>> at least ;) It would also simplify the patterns below. >>> >>> In the past I've dealt with processors where arithmetic wasn't handled >>> as efficiently as logical ops. Seems like an especial risk for 64-bit >>> elements, from a quick scan of the i386 scheduling models. >> >> But then expansion could undo this ... > > So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)? > Is there precendent for doing that kind of thing? Expanding it as this, yes. Whether there is precedence no idea, but surely the expand_unop path could, if there is no optab for neg:vector_mode, try expanding as vec_cond .. 1 0. There is precedence for different expansion paths dependent on optabs (or even rtx cost?). Of course expand_unop doesn't get the original tree ops (expand_expr.c does, where some special-casing using get_gimple_for_expr is). 
Not sure if expand_unop would get 'cond' in a form where it can recognize the result is either -1 or 0. >>> I also realised later that: >>> >>> /* Vector comparisons are defined to produce all-one or all-zero results. */ >>> (simplify >>> (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >>> (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>> (convert @0))) >>> >>> is redundant with some fold-const.c code. >> >> If so then you should remove the fold-const.c at the time you add the pattern. > > Can I just drop that part of the patch instead? The fold-const.c > code handles COND_EXPR and VEC_COND_EXPR analogously, so I'd have > to move COND_EXPR at the same time. And then the natural follow-on > would be: why not move the other COND_EXPR and VEC_COND_EXPR folds too? :-) Yes, why not? ;) But sure, you can also drop the case for now. >> Note that ISTR code performing exactly the opposite transform in >> fold-const.c ... > > That's another reason why I'm worried about just doing the (negate ...) > thing without knowing whether the negate can be folded into anything else. I'm not aware of anything here. Richard. > Thanks, > Richard >
Richard Biener <richard.guenther@gmail.com> writes: > On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> Richard Biener <richard.guenther@gmail.com> writes: >>>>> I'm fine with using tree_nop_conversion_p for now. >>>> >>>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element >>>> mode. How about: >>>> >>>> (if (VECTOR_INTEGER_TYPE_P (type) >>>> && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) >>>> && (TYPE_MODE (TREE_TYPE (type)) >>>> == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) >>>> >>>> (But is it really OK to be adding more mode-based compatibility checks? >>>> I thought you were hoping to move away from modes in the middle end.) >>> >>> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant >>> (the type of a comparison is always a signed vector integer type). >> >> OK, will just use VECTOR_TYPE_P then. > > Given we're in a VEC_COND_EXPR that's redundant as well. Hmm, but is it really guaranteed in: (plus:c @3 (view_convert (vec_cond @0 integer_each_onep@1 integer_zerop@2))) that the @3 and the view_convert are also vectors? I thought we allowed view_converts from vector to non-vector types. >>>>>>> +/* We could instead convert all instances of the vec_cond to negate, >>>>>>> + but that isn't necessarily a win on its own. */ >>>>> >>>>> so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks >>>>> more compact >>>>> at least ;) It would also simplify the patterns below. >>>> >>>> In the past I've dealt with processors where arithmetic wasn't handled >>>> as efficiently as logical ops. Seems like an especial risk for 64-bit >>>> elements, from a quick scan of the i386 scheduling models. >>> >>> But then expansion could undo this ... >> >> So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)? >> Is there precendent for doing that kind of thing? > > Expanding it as this, yes. 
Whether there is precedence no idea, but > surely the expand_unop path could, if there is no optab for neg:vector_mode, > try expanding as vec_cond .. 1 0. Yeah, that part isn't the problem. It's when there is an implementation of (neg ...) (which I'd hope all real integer vector architectures would support) but it's not as efficient as the (and ...) that most targets would use for a (vec_cond ... 0). > There is precedence for different > expansion paths dependent on optabs (or even rtx cost?). Of course > expand_unop doesn't get the original tree ops (expand_expr.c does, > where some special-casing using get_gimple_for_expr is). Not sure > if expand_unop would get 'cond' in a form where it can recognize > the result is either -1 or 0. It just seems inconsistent to have the optabs machinery try to detect this ad-hoc combination opportunity while still leaving the vcond optab to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0). The vcond optabs would still have the logic needed to produce the right code, but we'd be circumventing it and trying to reimplement one particular case in a different way. Thanks, Richard
On Wed, Jun 24, 2015 at 2:28 PM, Richard Sandiford <richard.sandiford@arm.com> wrote: > Richard Biener <richard.guenther@gmail.com> writes: >> On Wed, Jun 24, 2015 at 1:10 PM, Richard Sandiford >> <richard.sandiford@arm.com> wrote: >>> Richard Biener <richard.guenther@gmail.com> writes: >>>>>> I'm fine with using tree_nop_conversion_p for now. >>>>> >>>>> I like the suggestion about checking TYPE_VECTOR_SUBPARTS and the element >>>>> mode. How about: >>>>> >>>>> (if (VECTOR_INTEGER_TYPE_P (type) >>>>> && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0)) >>>>> && (TYPE_MODE (TREE_TYPE (type)) >>>>> == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0))))) >>>>> >>>>> (But is it really OK to be adding more mode-based compatibility checks? >>>>> I thought you were hoping to move away from modes in the middle end.) >>>> >>>> The TYPE_MODE check makes the VECTOR_INTEGER_TYPE_P check redundant >>>> (the type of a comparison is always a signed vector integer type). >>> >>> OK, will just use VECTOR_TYPE_P then. >> >> Given we're in a VEC_COND_EXPR that's redundant as well. > > Hmm, but is it really guaranteed in: > > (plus:c @3 (view_convert (vec_cond @0 integer_each_onep@1 integer_zerop@2))) > > that the @3 and the view_convert are also vectors? I thought we allowed > view_converts from vector to non-vector types. Hmm, true. >>>>>>>> +/* We could instead convert all instances of the vec_cond to negate, >>>>>>>> + but that isn't necessarily a win on its own. */ >>>>>> >>>>>> so p ? 1 : 0 -> -p? Why isn't that a win on its own? It looks >>>>>> more compact >>>>>> at least ;) It would also simplify the patterns below. >>>>> >>>>> In the past I've dealt with processors where arithmetic wasn't handled >>>>> as efficiently as logical ops. Seems like an especial risk for 64-bit >>>>> elements, from a quick scan of the i386 scheduling models. >>>> >>>> But then expansion could undo this ... >>> >>> So do the inverse fold and convert (neg (cond)) to (vec_cond cond 1 0)? 
>>> Is there precendent for doing that kind of thing? >> >> Expanding it as this, yes. Whether there is precedence no idea, but >> surely the expand_unop path could, if there is no optab for neg:vector_mode, >> try expanding as vec_cond .. 1 0. > > Yeah, that part isn't the problem. It's when there is an implementation > of (neg ...) (which I'd hope all real integer vector architectures would > support) but it's not as efficient as the (and ...) that most targets > would use for a (vec_cond ... 0). I would suppose that a single-operand op (neg) is always bettter than a two-operand (and) one. But you of course never know... >> There is precedence for different >> expansion paths dependent on optabs (or even rtx cost?). Of course >> expand_unop doesn't get the original tree ops (expand_expr.c does, >> where some special-casing using get_gimple_for_expr is). Not sure >> if expand_unop would get 'cond' in a form where it can recognize >> the result is either -1 or 0. > > It just seems inconsistent to have the optabs machinery try to detect > this ad-hoc combination opportunity while still leaving the vcond optab > to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0). > The vcond optabs would still have the logic needed to produce the > right code, but we'd be circumventing it and trying to reimplement > one particular case in a different way. That's true. One could also leave it to combine / simplify_rtx and thus rtx_cost. But that's true of all of the match.pd stuff you add, no? Richard. > Thanks, > Richard >
>>> There is precedence for different >>> expansion paths dependent on optabs (or even rtx cost?). Of course >>> expand_unop doesn't get the original tree ops (expand_expr.c does, >>> where some special-casing using get_gimple_for_expr is). Not sure >>> if expand_unop would get 'cond' in a form where it can recognize >>> the result is either -1 or 0. >> >> It just seems inconsistent to have the optabs machinery try to detect >> this ad-hoc combination opportunity while still leaving the vcond optab >> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0). >> The vcond optabs would still have the logic needed to produce the >> right code, but we'd be circumventing it and trying to reimplement >> one particular case in a different way. > > That's true. One could also leave it to combine / simplify_rtx and > thus rtx_cost. But that's true of all of the match.pd stuff you add, no? It's probably true of most match.pd stuff in general though :-) One advantage of match.pd of course is that it works across block boundaries. The difference between the stuff I added and converting vec_cond_expr to negate is that the stuff I added avoids the vec_cond_expr altogether and so ought to be an unequivocal win. Replacing vec_cond_expr with negate just rewrites it into another (arguably more surprising) form. Thanks, Richard
On Wed, Jun 24, 2015 at 3:37 PM, Richard Sandiford <richard.sandiford@arm.com> wrote: >>>> There is precedence for different >>>> expansion paths dependent on optabs (or even rtx cost?). Of course >>>> expand_unop doesn't get the original tree ops (expand_expr.c does, >>>> where some special-casing using get_gimple_for_expr is). Not sure >>>> if expand_unop would get 'cond' in a form where it can recognize >>>> the result is either -1 or 0. >>> >>> It just seems inconsistent to have the optabs machinery try to detect >>> this ad-hoc combination opportunity while still leaving the vcond optab >>> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0). >>> The vcond optabs would still have the logic needed to produce the >>> right code, but we'd be circumventing it and trying to reimplement >>> one particular case in a different way. >> >> That's true. One could also leave it to combine / simplify_rtx and >> thus rtx_cost. But that's true of all of the match.pd stuff you add, no? > > It's probably true of most match.pd stuff in general though :-) > One advantage of match.pd of course is that it works across > block boundaries. > > The difference between the stuff I added and converting vec_cond_expr > to negate is that the stuff I added avoids the vec_cond_expr altogether > and so ought to be an unequivocal win. Replacing vec_cond_expr with > negate just rewrites it into another (arguably more surprising) form. True. Btw, conditional view_convert is now in trunk so you can at least merge both plus:c patterns and both minus patterns. Richard. > Thanks, > Richard >
On 06/24/2015 05:43 AM, Richard Biener wrote: >>> Note that ISTR code performing exactly the opposite transform in >>> fold-const.c ... >> >> That's another reason why I'm worried about just doing the (negate ...) >> thing without knowing whether the negate can be folded into anything else. > > I'm not aware of anything here. It's worth looking at -- I've certainly seen cases where we end up in infinite recursion because we've got a transformation in one place (say match.pd) and its inverse elsewhere (fold-const.c). Jeff
On Wed, 24 Jun 2015, Richard Biener wrote: > On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford > <richard.sandiford@arm.com> wrote: >> Richard Biener <richard.guenther@gmail.com> writes: >>> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> wrote: >>>> On Tue, 23 Jun 2015, Richard Sandiford wrote: >>>> >>>>> +/* Vector comparisons are defined to produce all-one or all-zero results. >>>>> */ >>>>> +(simplify >>>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>>> + (convert @0))) >>>> >>>> >>>> I am trying to understand why the test tree_nop_conversion_p is the right >>>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By >>>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer vector >>>> types of the same size and number of elements. It thus seems like a >>>> conversion is always fine. For vectors, tree_nop_conversion_p apparently >>>> only checks that they have the same mode (quite often VOIDmode I guess). >>> >>> The only conversion we seem to allow is changing the signed vector from >>> the comparison result to an unsigned vector (same number of elements >>> and same mode of the elements). That is, a check using >>> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably >>> be better (well, technically a TYPE_VECTOR_SUBPARTS && element >>> mode compare should be better as generic vectors might not have a vector mode). >> >> OK. The reason I was being paranoid was that I couldn't see anywhere >> where we enforced that the vector condition in a VEC_COND had to have >> the same element width as the values being selected. > > We don't require that indeed. > >> tree-cfg.c >> only checks that rhs2 and rhs3 are compatible with the result. >> There doesn't seem to be any checking of rhs1 vs. the other types. >> So I wasn't sure whether anything stopped us from, e.g., comparing two >> V4HIs and using the result to select between two V4SIs. 
> > Nothing does (or should). The documentation patch you approved in https://gcc.gnu.org/ml/gcc-patches/2012-10/msg01109.html says something different. If it is really wrong, could you fix it?
On Fri, Jun 26, 2015 at 12:39 AM, Marc Glisse <marc.glisse@inria.fr> wrote: > On Wed, 24 Jun 2015, Richard Biener wrote: > >> On Wed, Jun 24, 2015 at 11:57 AM, Richard Sandiford >> <richard.sandiford@arm.com> wrote: >>> >>> Richard Biener <richard.guenther@gmail.com> writes: >>>> >>>> On Tue, Jun 23, 2015 at 11:27 PM, Marc Glisse <marc.glisse@inria.fr> >>>> wrote: >>>>> >>>>> On Tue, 23 Jun 2015, Richard Sandiford wrote: >>>>> >>>>>> +/* Vector comparisons are defined to produce all-one or all-zero >>>>>> results. >>>>>> */ >>>>>> +(simplify >>>>>> + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) >>>>>> + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) >>>>>> + (convert @0))) >>>>> >>>>> >>>>> >>>>> I am trying to understand why the test tree_nop_conversion_p is the >>>>> right >>>>> one (at least for the transformations not using VIEW_CONVERT_EXPR). By >>>>> definition of VEC_COND_EXPR, type and TREE_TYPE (@0) are both integer >>>>> vector >>>>> types of the same size and number of elements. It thus seems like a >>>>> conversion is always fine. For vectors, tree_nop_conversion_p >>>>> apparently >>>>> only checks that they have the same mode (quite often VOIDmode I >>>>> guess). >>>> >>>> >>>> The only conversion we seem to allow is changing the signed vector from >>>> the comparison result to an unsigned vector (same number of elements >>>> and same mode of the elements). That is, a check using >>>> TYPE_MODE (type) == TYPE_MODE (TREE_TYPE (@0)) would probably >>>> be better (well, technically a TYPE_VECTOR_SUBPARTS && element >>>> mode compare should be better as generic vectors might not have a vector >>>> mode). >>> >>> >>> OK. The reason I was being paranoid was that I couldn't see anywhere >>> where we enforced that the vector condition in a VEC_COND had to have >>> the same element width as the values being selected. >> >> >> We don't require that indeed. >> >>> tree-cfg.c >>> only checks that rhs2 and rhs3 are compatible with the result. 
>>> There doesn't seem to be any checking of rhs1 vs. the other types. >>> So I wasn't sure whether anything stopped us from, e.g., comparing two >>> V4HIs and using the result to select between two V4SIs. >> >> >> Nothing does (or should). > > > The documentation patch you approved in > https://gcc.gnu.org/ml/gcc-patches/2012-10/msg01109.html says something > different. If it is really wrong, could you fix it? Hmm, that simplifies things. It would be nice if these constraints would also be checked in the gimple verifier... Richard. > -- > Marc Glisse
Index: gcc/match.pd =================================================================== --- gcc/match.pd 2015-06-23 11:42:23.644645975 +0100 +++ gcc/match.pd 2015-06-23 11:42:23.760644655 +0100 @@ -973,6 +973,36 @@ along with GCC; see the file COPYING3. (cnd @0 @2 @1))) +/* Vector comparisons are defined to produce all-one or all-zero results. */ +(simplify + (vec_cond @0 integer_all_onesp@1 integer_zerop@2) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (convert @0))) + +/* We could instead convert all instances of the vec_cond to negate, + but that isn't necessarily a win on its own. */ +(simplify + (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (minus @3 (convert @0)))) + +(simplify + (plus:c @3 (view_convert_expr + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (minus @3 (convert @0)))) + +(simplify + (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2)) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (plus @3 (convert @0)))) + +(simplify + (minus @3 (view_convert_expr + (vec_cond @0 integer_each_onep@1 integer_zerop@2))) + (if (tree_nop_conversion_p (type, TREE_TYPE (@0))) + (plus @3 (convert @0)))) + /* Simplifications of comparisons. */ /* We can simplify a logical negation of a comparison to the Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c =================================================================== --- /dev/null 2015-06-02 17:27:28.541944012 +0100 +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c 2015-06-23 12:06:27.120203685 +0100 @@ -0,0 +1,94 @@ +/* Make sure that vector comaprison results are not unnecessarily ANDed + with vectors of 1. 
*/ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#define COUNT1(X) if (X) count += 1 +#define COUNT2(X) if (X) count -= 1 +#define COUNT3(X) count += (X) +#define COUNT4(X) count -= (X) + +#define COND1(X) (X) +#define COND2(X) ((X) ? 1 : 0) +#define COND3(X) ((X) ? -1 : 0) +#define COND4(X) ((X) ? 0 : 1) +#define COND5(X) ((X) ? 0 : -1) + +#define TEST_LT(X, Y) ((X) < (Y)) +#define TEST_LE(X, Y) ((X) <= (Y)) +#define TEST_GT(X, Y) ((X) > (Y)) +#define TEST_GE(X, Y) ((X) >= (Y)) +#define TEST_EQ(X, Y) ((X) == (Y)) +#define TEST_NE(X, Y) ((X) != (Y)) + +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \ + TYPE \ + reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \ + { \ + TYPE count = 0; \ + for (unsigned int i = 0; i < 1024; ++i) \ + COUNT (TEST (CMP_ARRAY[i], x)); \ + return count; \ + } + +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \ + void \ + plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ + { \ + for (unsigned int i = 0; i < 1024; ++i) \ + ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \ + } \ + void \ + plusc_##ID (void) \ + { \ + for (unsigned int i = 0; i < 1024; ++i) \ + ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \ + } \ + void \ + minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \ + { \ + for (unsigned int i = 0; i < 1024; ++i) \ + ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \ + } \ + void \ + minusc_##ID (void) \ + { \ + for (unsigned int i = 0; i < 1024; ++i) \ + ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \ + } + +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \ + typedef __typeof__(ARRAY[0]) ID##_type; \ + COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \ + COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \ + COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \ + COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \ + COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \ + COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \ + COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \ + COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, 
TEST, COND4) \ + COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5) + +signed int asi[1024] __attribute__ ((aligned (16))); +unsigned int aui[1024] __attribute__ ((aligned (16))); +signed long long asl[1024] __attribute__ ((aligned (16))); +unsigned long long aul[1024] __attribute__ ((aligned (16))); +float af[1024] __attribute__ ((aligned (16))); +double ad[1024] __attribute__ ((aligned (16))); + +ALL_LOOPS (si_si, aui, asi, TEST_LT) +ALL_LOOPS (ui_si, aui, asi, TEST_LE) +ALL_LOOPS (si_ui, aui, asi, TEST_GT) +ALL_LOOPS (ui_ui, aui, asi, TEST_GE) +ALL_LOOPS (sl_sl, asl, asl, TEST_NE) +ALL_LOOPS (ul_ul, aul, aul, TEST_NE) +ALL_LOOPS (si_f, asi, af, TEST_LE) +ALL_LOOPS (ui_f, aui, af, TEST_GT) +ALL_LOOPS (sl_d, asl, ad, TEST_GE) +ALL_LOOPS (ul_d, aul, ad, TEST_GT) + +/* { dg-final { scan-assembler-not "\tand\t" } } */ +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */ +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */ +/* { dg-final { scan-assembler "\tldr\tq" } } */ +/* { dg-final { scan-assembler "\tstr\tq" } } */