| Message ID | 87pp4kqk4l.fsf@e105548-lin.cambridge.arm.com |
|---|---|
| State | New |
On Thu, Jun 25, 2015 at 10:15 AM, Richard Sandiford
<richard.sandiford@arm.com> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Wed, Jun 24, 2015 at 3:37 PM, Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>>>> There is precedence for different
>>>>>> expansion paths dependent on optabs (or even rtx cost?).  Of course
>>>>>> expand_unop doesn't get the original tree ops (expand_expr.c does,
>>>>>> where some special-casing using get_gimple_for_expr is).  Not sure
>>>>>> if expand_unop would get 'cond' in a form where it can recognize
>>>>>> the result is either -1 or 0.
>>>>>
>>>>> It just seems inconsistent to have the optabs machinery try to detect
>>>>> this ad-hoc combination opportunity while still leaving the vcond optab
>>>>> to handle more arbitrary cases, like (vec_cond (eq x y) 0xbeef 0).
>>>>> The vcond optabs would still have the logic needed to produce the
>>>>> right code, but we'd be circumventing it and trying to reimplement
>>>>> one particular case in a different way.
>>>>
>>>> That's true.  One could also leave it to combine / simplify_rtx and
>>>> thus rtx_cost.  But that's true of all of the match.pd stuff you add, no?
>>>
>>> It's probably true of most match.pd stuff in general though :-)
>>> One advantage of match.pd of course is that it works across
>>> block boundaries.
>>>
>>> The difference between the stuff I added and converting vec_cond_expr
>>> to negate is that the stuff I added avoids the vec_cond_expr altogether
>>> and so ought to be an unequivocal win.  Replacing vec_cond_expr with
>>> negate just rewrites it into another (arguably more surprising) form.
>>
>> True.  Btw, conditional view_convert is now in trunk so you can at least
>> merge both plus:c patterns and both minus patterns.
>
> Thanks for adding that.  How does this version look?  Retested on
> x86_64-linux-gnu and aarch64-elf.
>
> Richard
>
>
> gcc/
> 	* match.pd: Add patterns for vec_conds between 1 and 0.
>
> gcc/testsuite/
> 	* gcc.target/aarch64/vect-add-sub-cond.c: New test.
>
> Index: gcc/match.pd
> ===================================================================
> --- gcc/match.pd	2015-06-24 20:24:31.344998571 +0100
> +++ gcc/match.pd	2015-06-24 20:24:31.340998617 +0100
> @@ -1014,6 +1014,26 @@ along with GCC; see the file COPYING3.
>    (cnd (logical_inverted_value truth_valued_p@0) @1 @2)
>    (cnd @0 @2 @1)))
>
> +/* A + (B vcmp C ? 1 : 0) -> A - (B vcmp C), since vector comparisons
> +   return all-1 or all-0 results.  */
> +/* ??? We could instead convert all instances of the vec_cond to negate,
> +   but that isn't necessarily a win on its own.  */
> +(simplify
> + (plus:c @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
> + (if (VECTOR_TYPE_P (type)
> +      && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
> +      && (TYPE_MODE (TREE_TYPE (type))
> +          == TYPE_MODE (TREE_TYPE (TREE_TYPE (@0)))))
> +  (minus @3 (view_convert @0))))
> +
> +/* ... likewise A - (B vcmp C ? 1 : 0) -> A + (B vcmp C).  */
> +(simplify
> + (minus @3 (view_convert? (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
> + (if (VECTOR_TYPE_P (type)
> +      && TYPE_VECTOR_SUBPARTS (type) == TYPE_VECTOR_SUBPARTS (TREE_TYPE (@0))
> +      && (TYPE_PRECISION (TREE_TYPE (type))
> +          == TYPE_PRECISION (TREE_TYPE (TREE_TYPE (@0)))))

Either TYPE_PRECISION or TYPE_MODE please ;)  I think that TYPE_MODE is
more correct if you consider

  (minus V4SF (view_convert:V4SF (vec_cond V4SI V4SI V4SI)))

where you would end up with a non-sensical TYPE_PRECISION query on V4SF.
So probably VECTOR_INTEGER_TYPE_P again, then TYPE_PRECISION is good.

Ok with that change.

Thanks,
Richard.

> +  (plus @3 (view_convert @0))))
>
>  /* Simplifications of comparisons.  */
>
> Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
> ===================================================================
> --- /dev/null	2015-06-02 17:27:28.541944012 +0100
> +++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-24 20:24:31.340998617 +0100
> @@ -0,0 +1,94 @@
> +/* Make sure that vector comparison results are not unnecessarily ANDed
> +   with vectors of 1.  */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#define COUNT1(X) if (X) count += 1
> +#define COUNT2(X) if (X) count -= 1
> +#define COUNT3(X) count += (X)
> +#define COUNT4(X) count -= (X)
> +
> +#define COND1(X) (X)
> +#define COND2(X) ((X) ? 1 : 0)
> +#define COND3(X) ((X) ? -1 : 0)
> +#define COND4(X) ((X) ? 0 : 1)
> +#define COND5(X) ((X) ? 0 : -1)
> +
> +#define TEST_LT(X, Y) ((X) < (Y))
> +#define TEST_LE(X, Y) ((X) <= (Y))
> +#define TEST_GT(X, Y) ((X) > (Y))
> +#define TEST_GE(X, Y) ((X) >= (Y))
> +#define TEST_EQ(X, Y) ((X) == (Y))
> +#define TEST_NE(X, Y) ((X) != (Y))
> +
> +#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
> +  TYPE \
> +  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    TYPE count = 0; \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      COUNT (TEST (CMP_ARRAY[i], x)); \
> +    return count; \
> +  }
> +
> +#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
> +  void \
> +  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
> +  } \
> +  void \
> +  plusc_##ID (void) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
> +  } \
> +  void \
> +  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
> +  } \
> +  void \
> +  minusc_##ID (void) \
> +  { \
> +    for (unsigned int i = 0; i < 1024; ++i) \
> +      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
> +  }
> +
> +#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
> +  typedef __typeof__(ARRAY[0]) ID##_type; \
> +  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
> +  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
> +  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
> +  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
> +  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
> +  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
> +  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
> +  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
> +  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
> +
> +signed int asi[1024] __attribute__ ((aligned (16)));
> +unsigned int aui[1024] __attribute__ ((aligned (16)));
> +signed long long asl[1024] __attribute__ ((aligned (16)));
> +unsigned long long aul[1024] __attribute__ ((aligned (16)));
> +float af[1024] __attribute__ ((aligned (16)));
> +double ad[1024] __attribute__ ((aligned (16)));
> +
> +ALL_LOOPS (si_si, aui, asi, TEST_LT)
> +ALL_LOOPS (ui_si, aui, asi, TEST_LE)
> +ALL_LOOPS (si_ui, aui, asi, TEST_GT)
> +ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
> +ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
> +ALL_LOOPS (ul_ul, aul, aul, TEST_EQ)
> +ALL_LOOPS (si_f, asi, af, TEST_LE)
> +ALL_LOOPS (ui_f, aui, af, TEST_GT)
> +ALL_LOOPS (sl_d, asl, ad, TEST_GE)
> +ALL_LOOPS (ul_d, aul, ad, TEST_GT)
> +
> +/* { dg-final { scan-assembler-not "\tand\t" } } */
> +/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
> +/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
> +/* { dg-final { scan-assembler "\tldr\tq" } } */
> +/* { dg-final { scan-assembler "\tstr\tq" } } */