Message ID | CAOvf_xyFZMQusRaXp1FP-MTVKbeMZfm-8EzOgMQvEdOhBwUHaA@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Tue, 6 May 2014, Evgeny Stupachenko wrote: > The patch on cost model was successfully committed. > I've separated the rest part of the patch on loads/stores group into > 2: on loads group and on stores group. > Below is first part on loads group. > > Bootstrap and make check passed on x86. > > Is it ok? > > ChangeLog: > > 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> > > * tree-vect-data-refs.c (vect_grouped_load_supported): New > check for loads group of length 3. > (vect_permute_load_chain): New permutations for loads group of > length 3. > * tree-vect-stmts.c (vect_model_load_cost): Change cost > of vec_perm_shuffle for the new permutations. > > ChangeLog for testsuite: > > 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> > > PR tree-optimization/52252 > * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c > index 274cdbd..feafb38 100644 > --- a/gcc/tree-vect-data-refs.c > +++ b/gcc/tree-vect-data-refs.c > @@ -4812,36 +4812,74 @@ vect_grouped_load_supported (tree vectype, > unsigned HOST_WIDE_INT count) > { > enum machine_mode mode = TYPE_MODE (vectype); > > - /* vect_permute_load_chain requires the group size to be a power of two. */ > - if (exact_log2 (count) == -1) > + /* vect_permute_load_chain requires the group size to be equal to 3 or > + be a power of two. */ > + if (count != 3 && exact_log2 (count) == -1) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "the size of the group of accesses" > - " is not a power of 2\n"); > + "the size of the group of accesses" > + " is not a power of 2 or not eqaul to 3\n"); equal > return false; > } > > /* Check that the permutation is supported. */ > if (VECTOR_MODE_P (mode)) > { > - unsigned int i, nelt = GET_MODE_NUNITS (mode); > + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); > unsigned char *sel = XALLOCAVEC (unsigned char, nelt); > > - for (i = 0; i < nelt; i++) > - sel[i] = i * 2; > - if (can_vec_perm_p (mode, false, sel)) > + if (exact_log2 (count) != -1) > { > for (i = 0; i < nelt; i++) > - sel[i] = i * 2 + 1; > + sel[i] = i * 2; > if (can_vec_perm_p (mode, false, sel)) > - return true; > + { > + for (i = 0; i < nelt; i++) > + sel[i] = i * 2 + 1; > + if (can_vec_perm_p (mode, false, sel)) > + return true; > + } > + } > + else if (count == 3) Please structure this if as having special cases first and then an else with gcc_assert (exact_log2 (count)). > + { > + unsigned int k; > + for (k = 0; k < 3; k++) > + { > + for (i = 0; i < nelt; i++) > + if (3 * i + k < 2 * nelt) > + sel[i] = 3 * i + k; > + else > + sel[i] = 0; > + if (!can_vec_perm_p (mode, false, sel)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "shuffle of 3 loads is not supported by \ > + target\n"); Don't use multi-line strings but do "shuffle of ..." "target\n"); instead. > + return false; > + } > + for (i = 0, j = 0; i < nelt; i++) > + if (3 * i + k < 2 * nelt) > + sel[i] = i; > + else > + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); > + if (!can_vec_perm_p (mode, false, sel)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "shuffle of 3 loads is not supported by \ > + target\n"); Likewise. > + return false; > + } > + } > + return true; > } > } > > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "extract even/odd not supported by target\n"); > + "extract even/odd not supported by target\n"); > return false; > } > > @@ -4859,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, > unsigned HOST_WIDE_INT count) > /* Function vect_permute_load_chain. > > Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be > - a power of 2, generate extract_even/odd stmts to reorder the input data > - correctly. Return the final references for loads in RESULT_CHAIN. > + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder > + the input data correctly. Return the final references for loads in > + RESULT_CHAIN. > > E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. > The input is 4 vectors each containing 8 elements. We assign a > number to each > @@ -4941,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, > { > tree data_ref, first_vect, second_vect; > tree perm_mask_even, perm_mask_odd; > + tree perm3_mask_low, perm3_mask_high; > gimple perm_stmt; > tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); > unsigned int i, j, log_length = exact_log2 (length); > @@ -4951,45 +4991,99 @@ vect_permute_load_chain (vec<tree> dr_chain, > memcpy (result_chain->address (), dr_chain.address (), > length * sizeof (tree)); > > - for (i = 0; i < nelt; ++i) > - sel[i] = i * 2; > - perm_mask_even = vect_gen_perm_mask (vectype, sel); > - gcc_assert (perm_mask_even != NULL); > + if (log_length != (unsigned int)-1) Same for the if-structure - first handle all special values and then in the else handle power-of-two cases. Ok with those changes. Thanks, Richard. > + { > + for (i = 0; i < nelt; ++i) > + sel[i] = i * 2; > + perm_mask_even = vect_gen_perm_mask (vectype, sel); > + gcc_assert (perm_mask_even != NULL); > > - for (i = 0; i < nelt; ++i) > - sel[i] = i * 2 + 1; > - perm_mask_odd = vect_gen_perm_mask (vectype, sel); > - gcc_assert (perm_mask_odd != NULL); > + for (i = 0; i < nelt; ++i) > + sel[i] = i * 2 + 1; > + perm_mask_odd = vect_gen_perm_mask (vectype, sel); > + gcc_assert (perm_mask_odd != NULL); > > - for (i = 0; i < log_length; i++) > - { > - for (j = 0; j < length; j += 2) > + for (i = 0; i < log_length; i++) > { > - first_vect = dr_chain[j]; > - second_vect = dr_chain[j+1]; > + for (j = 0; j < length; j += 2) > + { > + first_vect = dr_chain[j]; > + second_vect = dr_chain[j+1]; > + > + /* data_ref = permute_even (first_data_ref, second_data_ref); */ > + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); > + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > + first_vect, second_vect, > + perm_mask_even); > + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > + (*result_chain)[j/2] = data_ref; > + > + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ > + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); > + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > + first_vect, second_vect, > + perm_mask_odd); > + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > + (*result_chain)[j/2+length/2] = data_ref; > + } > + memcpy (dr_chain.address (), result_chain->address (), > + length * sizeof (tree)); > + } > + } > + /* length is not a power of 2. */ > + else > + { > + unsigned int k; > > - /* data_ref = permute_even (first_data_ref, second_data_ref); */ > - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); > + /* currently only length 3 is supported as most frequent case. */ > + gcc_assert (length == 3); > + > + for (k = 0; k < 3; k++) > + { > + for (i = 0; i < nelt; i++) > + if (3 * i + k < 2 * nelt) > + sel[i] = 3 * i + k; > + else > + sel[i] = 0; > + perm3_mask_low = vect_gen_perm_mask (vectype, sel); > + gcc_assert (perm3_mask_low != NULL); > + > + for (i = 0, j = 0; i < nelt; i++) > + if (3 * i + k < 2 * nelt) > + sel[i] = i; > + else > + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); > + > + perm3_mask_high = vect_gen_perm_mask (vectype, sel); > + gcc_assert (perm3_mask_high != NULL); > + > + first_vect = dr_chain[0]; > + second_vect = dr_chain[1]; > + > + /* Create interleaving stmt (low part of): > + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, > + ...}> */ > + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); > perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > first_vect, second_vect, > - perm_mask_even); > + perm3_mask_low); > vect_finish_stmt_generation (stmt, perm_stmt, gsi); > - (*result_chain)[j/2] = data_ref; > > - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ > - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); > + /* Create interleaving stmt (high part of): > + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, > + ...}> */ > + first_vect = data_ref; > + second_vect = dr_chain[2]; > + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); > perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > first_vect, second_vect, > - perm_mask_odd); > + perm3_mask_high); > vect_finish_stmt_generation (stmt, perm_stmt, gsi); > - (*result_chain)[j/2+length/2] = data_ref; > + (*result_chain)[k] = data_ref; > } > - memcpy (dr_chain.address (), result_chain->address (), > - length * sizeof (tree)); > } > } > > - > /* Function vect_transform_grouped_load. > > Given a chain of input interleaved data-refs (in DR_CHAIN), build statements > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > index 1a51d6d..b87c143 100644 > --- a/gcc/tree-vect-stmts.c > +++ b/gcc/tree-vect-stmts.c > @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, > int ncopies, > include the cost of the permutes. */ > if (!load_lanes_p && group_size > 1) > { > - /* Uses an even and odd extract operations for each needed permute. */ > - int nstmts = ncopies * exact_log2 (group_size) * group_size; > - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, > - stmt_info, 0, vect_body); > + /* Uses an even and odd extract operations or shuffle operations > + for each needed permute. */ > + int nstmts = ncopies * ceil_log2 (group_size) * group_size; > + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, > + stmt_info, 0, vect_body); > > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > > > diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > new file mode 100644 > index 0000000..6e3cb52 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 > -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ > + > +#define byte unsigned char > + > +void > +matrix_mul (byte *in, byte *out, int size) > +{ > + int i; > + for (i = 0; i < size; i++) > + { > + byte in0 = in[0]; > + byte in1 = in[1]; > + byte in2 = in[2]; > + byte out0, out1, out2, out3; > + out0 = in0 + in1; > + out1 = in0 + in2; > + out2 = in1 + in2; > + out3 = in0 + in1 + in2; > + out[0] = out0; > + out[1] = out1; > + out[2] = out2; > + out[3] = out3; > + in += 3; > + out += 4; > + } > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > > > On Wed, Apr 30, 2014 at 6:31 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > > Ping. > > > > On Fri, Apr 18, 2014 at 2:05 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >> Hi, > >> > >> Merged with current master the patch passes bootstrap and is giving > >> expected gains. > >> Patch and new tests are attached. > >> > >> ChangeLog: > >> > >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> > >> > >> * tree-vect-data-refs.c (vect_grouped_store_supported): New > >> check for stores group of length 3. > >> (vect_permute_store_chain): New permutations for stores group of > >> length 3. > >> (vect_grouped_load_supported): New check for loads group of length 3. > >> (vect_permute_load_chain): New permutations for loads group of length 3. > >> * tree-vect-stmts.c (vect_model_store_cost): Change cost > >> of vec_perm_shuffle for the new permutations. > >> (vect_model_load_cost): Ditto. > >> > >> ChangeLog for testsuite: > >> > >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> > >> > >> PR tree-optimization/52252 > >> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > >> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. > >> > >> Evgeny > >> > >> On Thu, Mar 6, 2014 at 6:44 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >>> Missed attachment. > >>> > >>> On Thu, Mar 6, 2014 at 6:42 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >>>> I've separated the patch into 2: cost model tuning and load/store > >>>> groups parallelism. > >>>> SLM tuning was partially introduced in the patch: > >>>> http://gcc.gnu.org/ml/gcc-patches/2014-03/msg00226.html > >>>> The patch introducing vectorization for load/store groups of size 3 attached. > >>>> > >>>> Is it ok for stage1? > >>>> > >>>> ChangeLog: > >>>> > >>>> 2014-03-06 Evgeny Stupachenko <evstupac@gmail.com> > >>>> > >>>> * tree-vect-data-refs.c (vect_grouped_store_supported): New > >>>> check for stores group of length 3. > >>>> (vect_permute_store_chain): New permutations for stores group of > >>>> length 3. > >>>> (vect_grouped_load_supported): New check for loads group of length 3. > >>>> (vect_permute_load_chain): New permutations for loads group of length 3. > >>>> * tree-vect-stmts.c (vect_model_store_cost): Change cost > >>>> of vec_perm_shuffle for the new permutations. > >>>> (vect_model_load_cost): Ditto. > >>>> > >>>> > >>>> > >>>> On Tue, Feb 11, 2014 at 7:19 PM, Richard Biener <rguenther@suse.de> wrote: > >>>>> On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: > >>>>> > >>>>>> Missed patch attached in plain-text. > >>>>>> > >>>>>> I have copyright assignment on file with the FSF covering work on GCC. > >>>>>> > >>>>>> Load/stores groups of length 3 is the most frequent non-power-of-2 > >>>>>> case. It is used in RGB image processing (like test case in PR52252). > >>>>>> For sure we can extend the patch to length 5 and more. However, this > >>>>>> potentially affect performance on some other architectures and > >>>>>> requires larger testing. So length 3 it is just first step.The > >>>>>> algorithm in the patch could be modified for a general case in several > >>>>>> steps. > >>>>>> > >>>>>> I understand that the patch should wait for the stage 1, however since > >>>>>> its ready we can discuss it right now and make some changes (like > >>>>>> general size of group). > >>>>> > >>>>> Other than that I'd like to see a vectorizer hook querying the cost of a > >>>>> vec_perm_const expansion instead of adding vec_perm_shuffle > >>>>> (thus requires the constant shuffle mask to be passed as well > >>>>> as the vector type). That's more useful for other uses that > >>>>> would require (arbitrary) shuffles. > >>>>> > >>>>> Didn't look at the rest of the patch yet - queued in my review > >>>>> pipeline. > >>>>> > >>>>> Thanks, > >>>>> Richard. > >>>>> > >>>>>> Thanks, > >>>>>> Evgeny > >>>>>> > >>>>>> On Tue, Feb 11, 2014 at 5:00 PM, Richard Biener <rguenther@suse.de> wrote: > >>>>>> > > >>>>>> > On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: > >>>>>> > > >>>>>> > > Hi, > >>>>>> > > > >>>>>> > > The patch gives an expected 3 times gain for the test case in the PR52252 > >>>>>> > > (and even 6 times for AVX2). > >>>>>> > > It passes make check and bootstrap on x86. > >>>>>> > > spec2000/spec2006 got no regressions/gains on x86. > >>>>>> > > > >>>>>> > > Is this patch ok? > >>>>>> > > >>>>>> > I've worked on generalizing the permutation support in the light > >>>>>> > of the availability of the generic shuffle support in the IL > >>>>>> > but hit some road-blocks in the way code-generation works for > >>>>>> > group loads with permutations (I don't remember if I posted all patches). > >>>>>> > > >>>>>> > This patch seems to be to a slightly different place but it again > >>>>>> > special-cases a specific permutation. Why's that? Why can't we > >>>>>> > support groups of size 7 for example? So - can this be generalized > >>>>>> > to support arbitrary non-power-of-two load/store groups? > >>>>>> > > >>>>>> > Other than that the patch has to wait for stage1 to open again, > >>>>>> > of course. And it misses a testcase. > >>>>>> > > >>>>>> > Btw, do you have a copyright assignment on file with the FSF covering > >>>>>> > work on GCC? > >>>>>> > > >>>>>> > Thanks, > >>>>>> > Richard. > >>>>>> > > >>>>>> > > ChangeLog: > >>>>>> > > > >>>>>> > > 2014-02-11 Evgeny Stupachenko <evstupac@gmail.com> > >>>>>> > > > >>>>>> > > * target.h (vect_cost_for_stmt): Defining new cost vec_perm_shuffle. > >>>>>> > > * tree-vect-data-refs.c (vect_grouped_store_supported): New > >>>>>> > > check for stores group of length 3. > >>>>>> > > (vect_permute_store_chain): New permutations for stores group of > >>>>>> > > length 3. > >>>>>> > > (vect_grouped_load_supported): New check for loads group of length > >>>>>> > > 3. > >>>>>> > > (vect_permute_load_chain): New permutations for loads group of > >>>>>> > > length 3. > >>>>>> > > * tree-vect-stmts.c (vect_model_store_cost): New cost > >>>>>> > > vec_perm_shuffle > >>>>>> > > for the new permutations. > >>>>>> > > (vect_model_load_cost): Ditto. > >>>>>> > > * config/aarch64/aarch64.c (builtin_vectorization_cost): Adding > >>>>>> > > vec_perm_shuffle cost as equvivalent of vec_perm cost. > >>>>>> > > * config/arm/arm.c: Ditto. > >>>>>> > > * config/rs6000/rs6000.c: Ditto. > >>>>>> > > * config/spu/spu.c: Ditto. > >>>>>> > > * config/i386/x86-tune.def (TARGET_SLOW_PHUFFB): Target for slow > >>>>>> > > byte > >>>>>> > > shuffle on some x86 architectures. > >>>>>> > > * config/i386/i386.h (processor_costs): Defining pshuffb cost. > >>>>>> > > * config/i386/i386.c (processor_costs): Adding pshuffb cost. > >>>>>> > > (ix86_builtin_vectorization_cost): Adding cost for the new > >>>>>> > > permutations. > >>>>>> > > Fixing cost for other permutations. > >>>>>> > > (expand_vec_perm_even_odd_1): Avoid byte shuffles when they are > >>>>>> > > slow (TARGET_SLOW_PHUFFB). > >>>>>> > > (ix86_add_stmt_cost): Adding cost when STMT is WIDEN_MULTIPLY. > >>>>>> > > Adding new shuffle cost only when byte shuffle is expected. > >>>>>> > > Fixing cost model for Silvermont. > >>>>>> > > > >>>>>> > > Thanks, > >>>>>> > > Evgeny > >>>>>> > > > >>>>>> > > >>>>>> > -- > >>>>>> > Richard Biener <rguenther@suse.de> > >>>>>> > SUSE / SUSE Labs > >>>>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > >>>>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer > >>>>>> > >>>>> > >>>>> -- > >>>>> Richard Biener <rguenther@suse.de> > >>>>> SUSE / SUSE Labs > >>>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > >>>>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer > >
On Tue, 6 May 2014, Evgeny Stupachenko wrote: > Patch with fixes attached. Ok if bootstrap/regtest passes. Thanks, Richard. > Currently if-structure is as following: > + if (count == 3) > ... > + else > + { > + /* If length is not equal to 3 then only power of 2 is supported. */ > + gcc_assert (exact_log2 (count) != -1); > > For stores group I've created another mail thread. > > Thanks, > Evgeny > > > On Tue, May 6, 2014 at 3:47 PM, Richard Biener <rguenther@suse.de> wrote: > > On Tue, 6 May 2014, Evgeny Stupachenko wrote: > > > >> The patch on cost model was successfully committed. > >> I've separated the rest part of the patch on loads/stores group into > >> 2: on loads group and on stores group. > >> Below is first part on loads group. > >> > >> Bootstrap and make check passed on x86. > >> > >> Is it ok? > >> > >> ChangeLog: > >> > >> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> > >> > >> * tree-vect-data-refs.c (vect_grouped_load_supported): New > >> check for loads group of length 3. > >> (vect_permute_load_chain): New permutations for loads group of > >> length 3. > >> * tree-vect-stmts.c (vect_model_load_cost): Change cost > >> of vec_perm_shuffle for the new permutations. > >> > >> ChangeLog for testsuite: > >> > >> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> > >> > >> PR tree-optimization/52252 > >> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > >> > >> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c > >> index 274cdbd..feafb38 100644 > >> --- a/gcc/tree-vect-data-refs.c > >> +++ b/gcc/tree-vect-data-refs.c > >> @@ -4812,36 +4812,74 @@ vect_grouped_load_supported (tree vectype, > >> unsigned HOST_WIDE_INT count) > >> { > >> enum machine_mode mode = TYPE_MODE (vectype); > >> > >> - /* vect_permute_load_chain requires the group size to be a power of two. */ > >> - if (exact_log2 (count) == -1) > >> + /* vect_permute_load_chain requires the group size to be equal to 3 or > >> + be a power of two. */ > >> + if (count != 3 && exact_log2 (count) == -1) > >> { > >> if (dump_enabled_p ()) > >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> - "the size of the group of accesses" > >> - " is not a power of 2\n"); > >> + "the size of the group of accesses" > >> + " is not a power of 2 or not eqaul to 3\n"); > > > > equal > > > >> return false; > >> } > >> > >> /* Check that the permutation is supported. */ > >> if (VECTOR_MODE_P (mode)) > >> { > >> - unsigned int i, nelt = GET_MODE_NUNITS (mode); > >> + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); > >> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); > >> > >> - for (i = 0; i < nelt; i++) > >> - sel[i] = i * 2; > >> - if (can_vec_perm_p (mode, false, sel)) > >> + if (exact_log2 (count) != -1) > >> { > >> for (i = 0; i < nelt; i++) > >> - sel[i] = i * 2 + 1; > >> + sel[i] = i * 2; > >> if (can_vec_perm_p (mode, false, sel)) > >> - return true; > >> + { > >> + for (i = 0; i < nelt; i++) > >> + sel[i] = i * 2 + 1; > >> + if (can_vec_perm_p (mode, false, sel)) > >> + return true; > >> + } > >> + } > >> + else if (count == 3) > > > > Please structure this if as having special cases first and then an > > else with gcc_assert (exact_log2 (count)). > > > >> + { > >> + unsigned int k; > >> + for (k = 0; k < 3; k++) > >> + { > >> + for (i = 0; i < nelt; i++) > >> + if (3 * i + k < 2 * nelt) > >> + sel[i] = 3 * i + k; > >> + else > >> + sel[i] = 0; > >> + if (!can_vec_perm_p (mode, false, sel)) > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "shuffle of 3 loads is not supported by \ > >> + target\n"); > > > > Don't use multi-line strings but do > > > > "shuffle of ..." > > "target\n"); > > > > instead. > > > >> + return false; > >> + } > >> + for (i = 0, j = 0; i < nelt; i++) > >> + if (3 * i + k < 2 * nelt) > >> + sel[i] = i; > >> + else > >> + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); > >> + if (!can_vec_perm_p (mode, false, sel)) > >> + { > >> + if (dump_enabled_p ()) > >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> + "shuffle of 3 loads is not supported by \ > >> + target\n"); > > > > Likewise. > > > >> + return false; > >> + } > >> + } > >> + return true; > >> } > >> } > >> > >> if (dump_enabled_p ()) > >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >> - "extract even/odd not supported by target\n"); > >> + "extract even/odd not supported by target\n"); > >> return false; > >> } > >> > >> @@ -4859,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, > >> unsigned HOST_WIDE_INT count) > >> /* Function vect_permute_load_chain. > >> > >> Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be > >> - a power of 2, generate extract_even/odd stmts to reorder the input data > >> - correctly. Return the final references for loads in RESULT_CHAIN. > >> + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder > >> + the input data correctly. Return the final references for loads in > >> + RESULT_CHAIN. > >> > >> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. > >> The input is 4 vectors each containing 8 elements. We assign a > >> number to each > >> @@ -4941,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, > >> { > >> tree data_ref, first_vect, second_vect; > >> tree perm_mask_even, perm_mask_odd; > >> + tree perm3_mask_low, perm3_mask_high; > >> gimple perm_stmt; > >> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); > >> unsigned int i, j, log_length = exact_log2 (length); > >> @@ -4951,45 +4991,99 @@ vect_permute_load_chain (vec<tree> dr_chain, > >> memcpy (result_chain->address (), dr_chain.address (), > >> length * sizeof (tree)); > >> > >> - for (i = 0; i < nelt; ++i) > >> - sel[i] = i * 2; > >> - perm_mask_even = vect_gen_perm_mask (vectype, sel); > >> - gcc_assert (perm_mask_even != NULL); > >> + if (log_length != (unsigned int)-1) > > > > Same for the if-structure - first handle all special values > > and then in the else handle power-of-two cases. > > > > Ok with those changes. > > > > Thanks, > > Richard. > > > >> + { > >> + for (i = 0; i < nelt; ++i) > >> + sel[i] = i * 2; > >> + perm_mask_even = vect_gen_perm_mask (vectype, sel); > >> + gcc_assert (perm_mask_even != NULL); > >> > >> - for (i = 0; i < nelt; ++i) > >> - sel[i] = i * 2 + 1; > >> - perm_mask_odd = vect_gen_perm_mask (vectype, sel); > >> - gcc_assert (perm_mask_odd != NULL); > >> + for (i = 0; i < nelt; ++i) > >> + sel[i] = i * 2 + 1; > >> + perm_mask_odd = vect_gen_perm_mask (vectype, sel); > >> + gcc_assert (perm_mask_odd != NULL); > >> > >> - for (i = 0; i < log_length; i++) > >> - { > >> - for (j = 0; j < length; j += 2) > >> + for (i = 0; i < log_length; i++) > >> { > >> - first_vect = dr_chain[j]; > >> - second_vect = dr_chain[j+1]; > >> + for (j = 0; j < length; j += 2) > >> + { > >> + first_vect = dr_chain[j]; > >> + second_vect = dr_chain[j+1]; > >> + > >> + /* data_ref = permute_even (first_data_ref, second_data_ref); */ > >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); > >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >> + first_vect, second_vect, > >> + perm_mask_even); > >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >> + (*result_chain)[j/2] = data_ref; > >> + > >> + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ > >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); > >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >> + first_vect, second_vect, > >> + perm_mask_odd); > >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >> + (*result_chain)[j/2+length/2] = data_ref; > >> + } > >> + memcpy (dr_chain.address (), result_chain->address (), > >> + length * sizeof (tree)); > >> + } > >> + } > >> + /* length is not a power of 2. */ > >> + else > >> + { > >> + unsigned int k; > >> > >> - /* data_ref = permute_even (first_data_ref, second_data_ref); */ > >> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); > >> + /* currently only length 3 is supported as most frequent case. */ > >> + gcc_assert (length == 3); > >> + > >> + for (k = 0; k < 3; k++) > >> + { > >> + for (i = 0; i < nelt; i++) > >> + if (3 * i + k < 2 * nelt) > >> + sel[i] = 3 * i + k; > >> + else > >> + sel[i] = 0; > >> + perm3_mask_low = vect_gen_perm_mask (vectype, sel); > >> + gcc_assert (perm3_mask_low != NULL); > >> + > >> + for (i = 0, j = 0; i < nelt; i++) > >> + if (3 * i + k < 2 * nelt) > >> + sel[i] = i; > >> + else > >> + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); > >> + > >> + perm3_mask_high = vect_gen_perm_mask (vectype, sel); > >> + gcc_assert (perm3_mask_high != NULL); > >> + > >> + first_vect = dr_chain[0]; > >> + second_vect = dr_chain[1]; > >> + > >> + /* Create interleaving stmt (low part of): > >> + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, > >> + ...}> */ > >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); > >> perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >> first_vect, second_vect, > >> - perm_mask_even); > >> + perm3_mask_low); > >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >> - (*result_chain)[j/2] = data_ref; > >> > >> - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ > >> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); > >> + /* Create interleaving stmt (high part of): > >> + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, > >> + ...}> */ > >> + first_vect = data_ref; > >> + second_vect = dr_chain[2]; > >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); > >> perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >> first_vect, second_vect, > >> - perm_mask_odd); > >> + perm3_mask_high); > >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >> - (*result_chain)[j/2+length/2] = data_ref; > >> + (*result_chain)[k] = data_ref; > >> } > >> - memcpy (dr_chain.address (), result_chain->address (), > >> - length * sizeof (tree)); > >> } > >> } > >> > >> - > >> /* Function vect_transform_grouped_load. > >> > >> Given a chain of input interleaved data-refs (in DR_CHAIN), build statements > >> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > >> index 1a51d6d..b87c143 100644 > >> --- a/gcc/tree-vect-stmts.c > >> +++ b/gcc/tree-vect-stmts.c > >> @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, > >> int ncopies, > >> include the cost of the permutes. */ > >> if (!load_lanes_p && group_size > 1) > >> { > >> - /* Uses an even and odd extract operations for each needed permute. */ > >> - int nstmts = ncopies * exact_log2 (group_size) * group_size; > >> - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, > >> - stmt_info, 0, vect_body); > >> + /* Uses an even and odd extract operations or shuffle operations > >> + for each needed permute. */ > >> + int nstmts = ncopies * ceil_log2 (group_size) * group_size; > >> + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, > >> + stmt_info, 0, vect_body); > >> > >> if (dump_enabled_p ()) > >> dump_printf_loc (MSG_NOTE, vect_location, > >> > >> > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > >> b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > >> new file mode 100644 > >> index 0000000..6e3cb52 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c > >> @@ -0,0 +1,30 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 > >> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ > >> + > >> +#define byte unsigned char > >> + > >> +void > >> +matrix_mul (byte *in, byte *out, int size) > >> +{ > >> + int i; > >> + for (i = 0; i < size; i++) > >> + { > >> + byte in0 = in[0]; > >> + byte in1 = in[1]; > >> + byte in2 = in[2]; > >> + byte out0, out1, out2, out3; > >> + out0 = in0 + in1; > >> + out1 = in0 + in2; > >> + out2 = in1 + in2; > >> + out3 = in0 + in1 + in2; > >> + out[0] = out0; > >> + out[1] = out1; > >> + out[2] = out2; > >> + out[3] = out3; > >> + in += 3; > >> + out += 4; > >> + } > >> +} > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > >> +/* { dg-final { cleanup-tree-dump "vect" } } */ > >> > >> > >> On Wed, Apr 30, 2014 at 6:31 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >> > Ping. > >> > > >> > On Fri, Apr 18, 2014 at 2:05 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >> >> Hi, > >> >> > >> >> Merged with current master the patch passes bootstrap and is giving > >> >> expected gains. > >> >> Patch and new tests are attached. > >> >> > >> >> ChangeLog: > >> >> > >> >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> > >> >> > >> >> * tree-vect-data-refs.c (vect_grouped_store_supported): New > >> >> check for stores group of length 3. > >> >> (vect_permute_store_chain): New permutations for stores group of > >> >> length 3. > >> >> (vect_grouped_load_supported): New check for loads group of length 3. > >> >> (vect_permute_load_chain): New permutations for loads group of length 3. > >> >> * tree-vect-stmts.c (vect_model_store_cost): Change cost > >> >> of vec_perm_shuffle for the new permutations. > >> >> (vect_model_load_cost): Ditto. > >> >> > >> >> ChangeLog for testsuite: > >> >> > >> >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> > >> >> > >> >> PR tree-optimization/52252 > >> >> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > >> >> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. > >> >> > >> >> Evgeny > >> >> > >> >> On Thu, Mar 6, 2014 at 6:44 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >> >>> Missed attachment. > >> >>> > >> >>> On Thu, Mar 6, 2014 at 6:42 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: > >> >>>> I've separated the patch into 2: cost model tuning and load/store > >> >>>> groups parallelism. > >> >>>> SLM tuning was partially introduced in the patch: > >> >>>> http://gcc.gnu.org/ml/gcc-patches/2014-03/msg00226.html > >> >>>> The patch introducing vectorization for load/store groups of size 3 attached. > >> >>>> > >> >>>> Is it ok for stage1? > >> >>>> > >> >>>> ChangeLog: > >> >>>> > >> >>>> 2014-03-06 Evgeny Stupachenko <evstupac@gmail.com> > >> >>>> > >> >>>> * tree-vect-data-refs.c (vect_grouped_store_supported): New > >> >>>> check for stores group of length 3. > >> >>>> (vect_permute_store_chain): New permutations for stores group of > >> >>>> length 3. > >> >>>> (vect_grouped_load_supported): New check for loads group of length 3. > >> >>>> (vect_permute_load_chain): New permutations for loads group of length 3. > >> >>>> * tree-vect-stmts.c (vect_model_store_cost): Change cost > >> >>>> of vec_perm_shuffle for the new permutations. > >> >>>> (vect_model_load_cost): Ditto. > >> >>>> > >> >>>> > >> >>>> > >> >>>> On Tue, Feb 11, 2014 at 7:19 PM, Richard Biener <rguenther@suse.de> wrote: > >> >>>>> On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: > >> >>>>> > >> >>>>>> Missed patch attached in plain-text. > >> >>>>>> > >> >>>>>> I have copyright assignment on file with the FSF covering work on GCC. > >> >>>>>> > >> >>>>>> Load/stores groups of length 3 is the most frequent non-power-of-2 > >> >>>>>> case. It is used in RGB image processing (like test case in PR52252). > >> >>>>>> For sure we can extend the patch to length 5 and more. However, this > >> >>>>>> potentially affect performance on some other architectures and > >> >>>>>> requires larger testing. So length 3 it is just first step.The > >> >>>>>> algorithm in the patch could be modified for a general case in several > >> >>>>>> steps. > >> >>>>>> > >> >>>>>> I understand that the patch should wait for the stage 1, however since > >> >>>>>> its ready we can discuss it right now and make some changes (like > >> >>>>>> general size of group). > >> >>>>> > >> >>>>> Other than that I'd like to see a vectorizer hook querying the cost of a > >> >>>>> vec_perm_const expansion instead of adding vec_perm_shuffle > >> >>>>> (thus requires the constant shuffle mask to be passed as well > >> >>>>> as the vector type). That's more useful for other uses that > >> >>>>> would require (arbitrary) shuffles. > >> >>>>> > >> >>>>> Didn't look at the rest of the patch yet - queued in my review > >> >>>>> pipeline. > >> >>>>> > >> >>>>> Thanks, > >> >>>>> Richard. > >> >>>>> > >> >>>>>> Thanks, > >> >>>>>> Evgeny > >> >>>>>> > >> >>>>>> On Tue, Feb 11, 2014 at 5:00 PM, Richard Biener <rguenther@suse.de> wrote: > >> >>>>>> > > >> >>>>>> > On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: > >> >>>>>> > > >> >>>>>> > > Hi, > >> >>>>>> > > > >> >>>>>> > > The patch gives an expected 3 times gain for the test case in the PR52252 > >> >>>>>> > > (and even 6 times for AVX2). > >> >>>>>> > > It passes make check and bootstrap on x86. > >> >>>>>> > > spec2000/spec2006 got no regressions/gains on x86. > >> >>>>>> > > > >> >>>>>> > > Is this patch ok? > >> >>>>>> > > >> >>>>>> > I've worked on generalizing the permutation support in the light > >> >>>>>> > of the availability of the generic shuffle support in the IL > >> >>>>>> > but hit some road-blocks in the way code-generation works for > >> >>>>>> > group loads with permutations (I don't remember if I posted all patches). > >> >>>>>> > > >> >>>>>> > This patch seems to be to a slightly different place but it again > >> >>>>>> > special-cases a specific permutation. Why's that? Why can't we > >> >>>>>> > support groups of size 7 for example? So - can this be generalized > >> >>>>>> > to support arbitrary non-power-of-two load/store groups? > >> >>>>>> > > >> >>>>>> > Other than that the patch has to wait for stage1 to open again, > >> >>>>>> > of course. And it misses a testcase. > >> >>>>>> > > >> >>>>>> > Btw, do you have a copyright assignment on file with the FSF covering > >> >>>>>> > work on GCC? > >> >>>>>> > > >> >>>>>> > Thanks, > >> >>>>>> > Richard. > >> >>>>>> > > >> >>>>>> > > ChangeLog: > >> >>>>>> > > > >> >>>>>> > > 2014-02-11 Evgeny Stupachenko <evstupac@gmail.com> > >> >>>>>> > > > >> >>>>>> > > * target.h (vect_cost_for_stmt): Defining new cost vec_perm_shuffle. > >> >>>>>> > > * tree-vect-data-refs.c (vect_grouped_store_supported): New > >> >>>>>> > > check for stores group of length 3. > >> >>>>>> > > (vect_permute_store_chain): New permutations for stores group of > >> >>>>>> > > length 3. > >> >>>>>> > > (vect_grouped_load_supported): New check for loads group of length > >> >>>>>> > > 3. > >> >>>>>> > > (vect_permute_load_chain): New permutations for loads group of > >> >>>>>> > > length 3. > >> >>>>>> > > * tree-vect-stmts.c (vect_model_store_cost): New cost > >> >>>>>> > > vec_perm_shuffle > >> >>>>>> > > for the new permutations. > >> >>>>>> > > (vect_model_load_cost): Ditto. > >> >>>>>> > > * config/aarch64/aarch64.c (builtin_vectorization_cost): Adding > >> >>>>>> > > vec_perm_shuffle cost as equvivalent of vec_perm cost. > >> >>>>>> > > * config/arm/arm.c: Ditto. > >> >>>>>> > > * config/rs6000/rs6000.c: Ditto. > >> >>>>>> > > * config/spu/spu.c: Ditto. > >> >>>>>> > > * config/i386/x86-tune.def (TARGET_SLOW_PHUFFB): Target for slow > >> >>>>>> > > byte > >> >>>>>> > > shuffle on some x86 architectures. > >> >>>>>> > > * config/i386/i386.h (processor_costs): Defining pshuffb cost. > >> >>>>>> > > * config/i386/i386.c (processor_costs): Adding pshuffb cost. > >> >>>>>> > > (ix86_builtin_vectorization_cost): Adding cost for the new > >> >>>>>> > > permutations. > >> >>>>>> > > Fixing cost for other permutations. > >> >>>>>> > > (expand_vec_perm_even_odd_1): Avoid byte shuffles when they are > >> >>>>>> > > slow (TARGET_SLOW_PHUFFB). > >> >>>>>> > > (ix86_add_stmt_cost): Adding cost when STMT is WIDEN_MULTIPLY. > >> >>>>>> > > Adding new shuffle cost only when byte shuffle is expected. > >> >>>>>> > > Fixing cost model for Silvermont. > >> >>>>>> > > > >> >>>>>> > > Thanks, > >> >>>>>> > > Evgeny > >> >>>>>> > > > >> >>>>>> > > >> >>>>>> > -- > >> >>>>>> > Richard Biener <rguenther@suse.de> > >> >>>>>> > SUSE / SUSE Labs > >> >>>>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > >> >>>>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer > >> >>>>>> > >> >>>>> > >> >>>>> -- > >> >>>>> Richard Biener <rguenther@suse.de> > >> >>>>> SUSE / SUSE Labs > >> >>>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > >> >>>>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer > >> > >> > > > > -- > > Richard Biener <rguenther@suse.de> > > SUSE / SUSE Labs > > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer >
Patch with fixes attached. Currently if-structure is as following: + if (count == 3) ... + else + { + /* If length is not equal to 3 then only power of 2 is supported. */ + gcc_assert (exact_log2 (count) != -1); For stores group I've created another mail thread. Thanks, Evgeny On Tue, May 6, 2014 at 3:47 PM, Richard Biener <rguenther@suse.de> wrote: > On Tue, 6 May 2014, Evgeny Stupachenko wrote: > >> The patch on cost model was successfully committed. >> I've separated the rest part of the patch on loads/stores group into >> 2: on loads group and on stores group. >> Below is first part on loads group. >> >> Bootstrap and make check passed on x86. >> >> Is it ok? >> >> ChangeLog: >> >> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> >> >> * tree-vect-data-refs.c (vect_grouped_load_supported): New >> check for loads group of length 3. >> (vect_permute_load_chain): New permutations for loads group of >> length 3. >> * tree-vect-stmts.c (vect_model_load_cost): Change cost >> of vec_perm_shuffle for the new permutations. >> >> ChangeLog for testsuite: >> >> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> >> >> PR tree-optimization/52252 >> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. >> >> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c >> index 274cdbd..feafb38 100644 >> --- a/gcc/tree-vect-data-refs.c >> +++ b/gcc/tree-vect-data-refs.c >> @@ -4812,36 +4812,74 @@ vect_grouped_load_supported (tree vectype, >> unsigned HOST_WIDE_INT count) >> { >> enum machine_mode mode = TYPE_MODE (vectype); >> >> - /* vect_permute_load_chain requires the group size to be a power of two. */ >> - if (exact_log2 (count) == -1) >> + /* vect_permute_load_chain requires the group size to be equal to 3 or >> + be a power of two. */ >> + if (count != 3 && exact_log2 (count) == -1) >> { >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> - "the size of the group of accesses" >> - " is not a power of 2\n"); >> + "the size of the group of accesses" >> + " is not a power of 2 or not eqaul to 3\n"); > > equal > >> return false; >> } >> >> /* Check that the permutation is supported. */ >> if (VECTOR_MODE_P (mode)) >> { >> - unsigned int i, nelt = GET_MODE_NUNITS (mode); >> + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); >> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); >> >> - for (i = 0; i < nelt; i++) >> - sel[i] = i * 2; >> - if (can_vec_perm_p (mode, false, sel)) >> + if (exact_log2 (count) != -1) >> { >> for (i = 0; i < nelt; i++) >> - sel[i] = i * 2 + 1; >> + sel[i] = i * 2; >> if (can_vec_perm_p (mode, false, sel)) >> - return true; >> + { >> + for (i = 0; i < nelt; i++) >> + sel[i] = i * 2 + 1; >> + if (can_vec_perm_p (mode, false, sel)) >> + return true; >> + } >> + } >> + else if (count == 3) > > Please structure this if as having special cases first and then an > else with gcc_assert (exact_log2 (count)). > >> + { >> + unsigned int k; >> + for (k = 0; k < 3; k++) >> + { >> + for (i = 0; i < nelt; i++) >> + if (3 * i + k < 2 * nelt) >> + sel[i] = 3 * i + k; >> + else >> + sel[i] = 0; >> + if (!can_vec_perm_p (mode, false, sel)) >> + { >> + if (dump_enabled_p ()) >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> + "shuffle of 3 loads is not supported by \ >> + target\n"); > > Don't use multi-line strings but do > > "shuffle of ..." > "target\n"); > > instead. > >> + return false; >> + } >> + for (i = 0, j = 0; i < nelt; i++) >> + if (3 * i + k < 2 * nelt) >> + sel[i] = i; >> + else >> + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); >> + if (!can_vec_perm_p (mode, false, sel)) >> + { >> + if (dump_enabled_p ()) >> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> + "shuffle of 3 loads is not supported by \ >> + target\n"); > > Likewise. > >> + return false; >> + } >> + } >> + return true; >> } >> } >> >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, >> - "extract even/odd not supported by target\n"); >> + "extract even/odd not supported by target\n"); >> return false; >> } >> >> @@ -4859,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, >> unsigned HOST_WIDE_INT count) >> /* Function vect_permute_load_chain. >> >> Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be >> - a power of 2, generate extract_even/odd stmts to reorder the input data >> - correctly. Return the final references for loads in RESULT_CHAIN. >> + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder >> + the input data correctly. Return the final references for loads in >> + RESULT_CHAIN. >> >> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. >> The input is 4 vectors each containing 8 elements. We assign a >> number to each >> @@ -4941,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, >> { >> tree data_ref, first_vect, second_vect; >> tree perm_mask_even, perm_mask_odd; >> + tree perm3_mask_low, perm3_mask_high; >> gimple perm_stmt; >> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); >> unsigned int i, j, log_length = exact_log2 (length); >> @@ -4951,45 +4991,99 @@ vect_permute_load_chain (vec<tree> dr_chain, >> memcpy (result_chain->address (), dr_chain.address (), >> length * sizeof (tree)); >> >> - for (i = 0; i < nelt; ++i) >> - sel[i] = i * 2; >> - perm_mask_even = vect_gen_perm_mask (vectype, sel); >> - gcc_assert (perm_mask_even != NULL); >> + if (log_length != (unsigned int)-1) > > Same for the if-structure - first handle all special values > and then in the else handle power-of-two cases. > > Ok with those changes. > > Thanks, > Richard. > >> + { >> + for (i = 0; i < nelt; ++i) >> + sel[i] = i * 2; >> + perm_mask_even = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm_mask_even != NULL); >> >> - for (i = 0; i < nelt; ++i) >> - sel[i] = i * 2 + 1; >> - perm_mask_odd = vect_gen_perm_mask (vectype, sel); >> - gcc_assert (perm_mask_odd != NULL); >> + for (i = 0; i < nelt; ++i) >> + sel[i] = i * 2 + 1; >> + perm_mask_odd = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm_mask_odd != NULL); >> >> - for (i = 0; i < log_length; i++) >> - { >> - for (j = 0; j < length; j += 2) >> + for (i = 0; i < log_length; i++) >> { >> - first_vect = dr_chain[j]; >> - second_vect = dr_chain[j+1]; >> + for (j = 0; j < length; j += 2) >> + { >> + first_vect = dr_chain[j]; >> + second_vect = dr_chain[j+1]; >> + >> + /* data_ref = permute_even (first_data_ref, second_data_ref); */ >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> + first_vect, second_vect, >> + perm_mask_even); >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> + (*result_chain)[j/2] = data_ref; >> + >> + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); >> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> + first_vect, second_vect, >> + perm_mask_odd); >> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> + (*result_chain)[j/2+length/2] = data_ref; >> + } >> + memcpy (dr_chain.address (), result_chain->address (), >> + length * sizeof (tree)); >> + } >> + } >> + /* length is not a power of 2. */ >> + else >> + { >> + unsigned int k; >> >> - /* data_ref = permute_even (first_data_ref, second_data_ref); */ >> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); >> + /* currently only length 3 is supported as most frequent case. */ >> + gcc_assert (length == 3); >> + >> + for (k = 0; k < 3; k++) >> + { >> + for (i = 0; i < nelt; i++) >> + if (3 * i + k < 2 * nelt) >> + sel[i] = 3 * i + k; >> + else >> + sel[i] = 0; >> + perm3_mask_low = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm3_mask_low != NULL); >> + >> + for (i = 0, j = 0; i < nelt; i++) >> + if (3 * i + k < 2 * nelt) >> + sel[i] = i; >> + else >> + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); >> + >> + perm3_mask_high = vect_gen_perm_mask (vectype, sel); >> + gcc_assert (perm3_mask_high != NULL); >> + >> + first_vect = dr_chain[0]; >> + second_vect = dr_chain[1]; >> + >> + /* Create interleaving stmt (low part of): >> + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, >> + ...}> */ >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); >> perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> first_vect, second_vect, >> - perm_mask_even); >> + perm3_mask_low); >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> - (*result_chain)[j/2] = data_ref; >> >> - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ >> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); >> + /* Create interleaving stmt (high part of): >> + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, >> + ...}> */ >> + first_vect = data_ref; >> + second_vect = dr_chain[2]; >> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); >> perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >> first_vect, second_vect, >> - perm_mask_odd); >> + perm3_mask_high); >> vect_finish_stmt_generation (stmt, perm_stmt, gsi); >> - (*result_chain)[j/2+length/2] = data_ref; >> + (*result_chain)[k] = data_ref; >> } >> - memcpy (dr_chain.address (), result_chain->address (), >> - length * sizeof (tree)); >> } >> } >> >> - >> /* Function vect_transform_grouped_load. >> >> Given a chain of input interleaved data-refs (in DR_CHAIN), build statements >> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c >> index 1a51d6d..b87c143 100644 >> --- a/gcc/tree-vect-stmts.c >> +++ b/gcc/tree-vect-stmts.c >> @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, >> int ncopies, >> include the cost of the permutes. */ >> if (!load_lanes_p && group_size > 1) >> { >> - /* Uses an even and odd extract operations for each needed permute. */ >> - int nstmts = ncopies * exact_log2 (group_size) * group_size; >> - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, >> - stmt_info, 0, vect_body); >> + /* Uses an even and odd extract operations or shuffle operations >> + for each needed permute. */ >> + int nstmts = ncopies * ceil_log2 (group_size) * group_size; >> + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, >> + stmt_info, 0, vect_body); >> >> if (dump_enabled_p ()) >> dump_printf_loc (MSG_NOTE, vect_location, >> >> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c >> b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c >> new file mode 100644 >> index 0000000..6e3cb52 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c >> @@ -0,0 +1,30 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 >> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ >> + >> +#define byte unsigned char >> + >> +void >> +matrix_mul (byte *in, byte *out, int size) >> +{ >> + int i; >> + for (i = 0; i < size; i++) >> + { >> + byte in0 = in[0]; >> + byte in1 = in[1]; >> + byte in2 = in[2]; >> + byte out0, out1, out2, out3; >> + out0 = in0 + in1; >> + out1 = in0 + in2; >> + out2 = in1 + in2; >> + out3 = in0 + in1 + in2; >> + out[0] = out0; >> + out[1] = out1; >> + out[2] = out2; >> + out[3] = out3; >> + in += 3; >> + out += 4; >> + } >> +} >> + >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ >> +/* { dg-final { cleanup-tree-dump "vect" } } */ >> >> >> On Wed, Apr 30, 2014 at 6:31 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> > Ping. >> > >> > On Fri, Apr 18, 2014 at 2:05 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> >> Hi, >> >> >> >> Merged with current master the patch passes bootstrap and is giving >> >> expected gains. >> >> Patch and new tests are attached. >> >> >> >> ChangeLog: >> >> >> >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> >> >> >> >> * tree-vect-data-refs.c (vect_grouped_store_supported): New >> >> check for stores group of length 3. >> >> (vect_permute_store_chain): New permutations for stores group of >> >> length 3. >> >> (vect_grouped_load_supported): New check for loads group of length 3. >> >> (vect_permute_load_chain): New permutations for loads group of length 3. >> >> * tree-vect-stmts.c (vect_model_store_cost): Change cost >> >> of vec_perm_shuffle for the new permutations. >> >> (vect_model_load_cost): Ditto. >> >> >> >> ChangeLog for testsuite: >> >> >> >> 2014-04-18 Evgeny Stupachenko <evstupac@gmail.com> >> >> >> >> PR tree-optimization/52252 >> >> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. >> >> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. >> >> >> >> Evgeny >> >> >> >> On Thu, Mar 6, 2014 at 6:44 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> >>> Missed attachment. >> >>> >> >>> On Thu, Mar 6, 2014 at 6:42 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote: >> >>>> I've separated the patch into 2: cost model tuning and load/store >> >>>> groups parallelism. >> >>>> SLM tuning was partially introduced in the patch: >> >>>> http://gcc.gnu.org/ml/gcc-patches/2014-03/msg00226.html >> >>>> The patch introducing vectorization for load/store groups of size 3 attached. >> >>>> >> >>>> Is it ok for stage1? >> >>>> >> >>>> ChangeLog: >> >>>> >> >>>> 2014-03-06 Evgeny Stupachenko <evstupac@gmail.com> >> >>>> >> >>>> * tree-vect-data-refs.c (vect_grouped_store_supported): New >> >>>> check for stores group of length 3. >> >>>> (vect_permute_store_chain): New permutations for stores group of >> >>>> length 3. >> >>>> (vect_grouped_load_supported): New check for loads group of length 3. >> >>>> (vect_permute_load_chain): New permutations for loads group of length 3. >> >>>> * tree-vect-stmts.c (vect_model_store_cost): Change cost >> >>>> of vec_perm_shuffle for the new permutations. >> >>>> (vect_model_load_cost): Ditto. >> >>>> >> >>>> >> >>>> >> >>>> On Tue, Feb 11, 2014 at 7:19 PM, Richard Biener <rguenther@suse.de> wrote: >> >>>>> On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: >> >>>>> >> >>>>>> Missed patch attached in plain-text. >> >>>>>> >> >>>>>> I have copyright assignment on file with the FSF covering work on GCC. >> >>>>>> >> >>>>>> Load/stores groups of length 3 is the most frequent non-power-of-2 >> >>>>>> case. It is used in RGB image processing (like test case in PR52252). >> >>>>>> For sure we can extend the patch to length 5 and more. However, this >> >>>>>> potentially affect performance on some other architectures and >> >>>>>> requires larger testing. So length 3 it is just first step.The >> >>>>>> algorithm in the patch could be modified for a general case in several >> >>>>>> steps. >> >>>>>> >> >>>>>> I understand that the patch should wait for the stage 1, however since >> >>>>>> its ready we can discuss it right now and make some changes (like >> >>>>>> general size of group). >> >>>>> >> >>>>> Other than that I'd like to see a vectorizer hook querying the cost of a >> >>>>> vec_perm_const expansion instead of adding vec_perm_shuffle >> >>>>> (thus requires the constant shuffle mask to be passed as well >> >>>>> as the vector type). That's more useful for other uses that >> >>>>> would require (arbitrary) shuffles. >> >>>>> >> >>>>> Didn't look at the rest of the patch yet - queued in my review >> >>>>> pipeline. >> >>>>> >> >>>>> Thanks, >> >>>>> Richard. >> >>>>> >> >>>>>> Thanks, >> >>>>>> Evgeny >> >>>>>> >> >>>>>> On Tue, Feb 11, 2014 at 5:00 PM, Richard Biener <rguenther@suse.de> wrote: >> >>>>>> > >> >>>>>> > On Tue, 11 Feb 2014, Evgeny Stupachenko wrote: >> >>>>>> > >> >>>>>> > > Hi, >> >>>>>> > > >> >>>>>> > > The patch gives an expected 3 times gain for the test case in the PR52252 >> >>>>>> > > (and even 6 times for AVX2). >> >>>>>> > > It passes make check and bootstrap on x86. >> >>>>>> > > spec2000/spec2006 got no regressions/gains on x86. >> >>>>>> > > >> >>>>>> > > Is this patch ok? >> >>>>>> > >> >>>>>> > I've worked on generalizing the permutation support in the light >> >>>>>> > of the availability of the generic shuffle support in the IL >> >>>>>> > but hit some road-blocks in the way code-generation works for >> >>>>>> > group loads with permutations (I don't remember if I posted all patches). >> >>>>>> > >> >>>>>> > This patch seems to be to a slightly different place but it again >> >>>>>> > special-cases a specific permutation. Why's that? Why can't we >> >>>>>> > support groups of size 7 for example? So - can this be generalized >> >>>>>> > to support arbitrary non-power-of-two load/store groups? >> >>>>>> > >> >>>>>> > Other than that the patch has to wait for stage1 to open again, >> >>>>>> > of course. And it misses a testcase. >> >>>>>> > >> >>>>>> > Btw, do you have a copyright assignment on file with the FSF covering >> >>>>>> > work on GCC? >> >>>>>> > >> >>>>>> > Thanks, >> >>>>>> > Richard. >> >>>>>> > >> >>>>>> > > ChangeLog: >> >>>>>> > > >> >>>>>> > > 2014-02-11 Evgeny Stupachenko <evstupac@gmail.com> >> >>>>>> > > >> >>>>>> > > * target.h (vect_cost_for_stmt): Defining new cost vec_perm_shuffle. >> >>>>>> > > * tree-vect-data-refs.c (vect_grouped_store_supported): New >> >>>>>> > > check for stores group of length 3. >> >>>>>> > > (vect_permute_store_chain): New permutations for stores group of >> >>>>>> > > length 3. >> >>>>>> > > (vect_grouped_load_supported): New check for loads group of length >> >>>>>> > > 3. >> >>>>>> > > (vect_permute_load_chain): New permutations for loads group of >> >>>>>> > > length 3. >> >>>>>> > > * tree-vect-stmts.c (vect_model_store_cost): New cost >> >>>>>> > > vec_perm_shuffle >> >>>>>> > > for the new permutations. >> >>>>>> > > (vect_model_load_cost): Ditto. >> >>>>>> > > * config/aarch64/aarch64.c (builtin_vectorization_cost): Adding >> >>>>>> > > vec_perm_shuffle cost as equvivalent of vec_perm cost. >> >>>>>> > > * config/arm/arm.c: Ditto. >> >>>>>> > > * config/rs6000/rs6000.c: Ditto. >> >>>>>> > > * config/spu/spu.c: Ditto. >> >>>>>> > > * config/i386/x86-tune.def (TARGET_SLOW_PHUFFB): Target for slow >> >>>>>> > > byte >> >>>>>> > > shuffle on some x86 architectures. >> >>>>>> > > * config/i386/i386.h (processor_costs): Defining pshuffb cost. >> >>>>>> > > * config/i386/i386.c (processor_costs): Adding pshuffb cost. >> >>>>>> > > (ix86_builtin_vectorization_cost): Adding cost for the new >> >>>>>> > > permutations. >> >>>>>> > > Fixing cost for other permutations. >> >>>>>> > > (expand_vec_perm_even_odd_1): Avoid byte shuffles when they are >> >>>>>> > > slow (TARGET_SLOW_PHUFFB). >> >>>>>> > > (ix86_add_stmt_cost): Adding cost when STMT is WIDEN_MULTIPLY. >> >>>>>> > > Adding new shuffle cost only when byte shuffle is expected. >> >>>>>> > > Fixing cost model for Silvermont. >> >>>>>> > > >> >>>>>> > > Thanks, >> >>>>>> > > Evgeny >> >>>>>> > > >> >>>>>> > >> >>>>>> > -- >> >>>>>> > Richard Biener <rguenther@suse.de> >> >>>>>> > SUSE / SUSE Labs >> >>>>>> > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 >> >>>>>> > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer >> >>>>>> >> >>>>> >> >>>>> -- >> >>>>> Richard Biener <rguenther@suse.de> >> >>>>> SUSE / SUSE Labs >> >>>>> SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 >> >>>>> GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer >> >> > > -- > Richard Biener <rguenther@suse.de> > SUSE / SUSE Labs > SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 > GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer
Evgeny Stupachenko <evstupac@gmail.com> writes: > Patch with fixes attached. > Currently if-structure is as following: > + if (count == 3) > ... > + else > + { > + /* If length is not equal to 3 then only power of 2 is supported. */ > + gcc_assert (exact_log2 (count) != -1); > > For stores group I've created another mail thread. [...] >>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> >>> >>> PR tree-optimization/52252 >>> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. This test FAILs on sparc-sun-solaris2.11, both 32 and 64-bit: FAIL: gcc.dg/vect/pr52252-ld.c scan-tree-dump-times vect "vectorized 1 loops" 1 FAIL: gcc.dg/vect/pr52252-ld.c -flto -ffat-lto-objects scan-tree-dump-times vect "vectorized 1 loops" 1 The dumps have /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not vectorized: relevant stmt not supported: in0_9 = *in_27; /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:7:1: note: vectorized 0 loops in function. Rainer
The test is on general changes. However I was able to test it on x86 only. I see 2 possible solutions: 1. Set the test for x86 only. 2. Modify it so that it will pass on sparc-sun-solaris2. If 2. is not acceptable I'll create patch for 1. Currently I don't see why "in0_9 = *in_27" is not supported. Does the test fail because of unsupported permutation? On Mon, May 12, 2014 at 7:14 PM, Rainer Orth <ro@cebitec.uni-bielefeld.de> wrote: > Evgeny Stupachenko <evstupac@gmail.com> writes: > >> Patch with fixes attached. >> Currently if-structure is as following: >> + if (count == 3) >> ... >> + else >> + { >> + /* If length is not equal to 3 then only power of 2 is supported. */ >> + gcc_assert (exact_log2 (count) != -1); >> >> For stores group I've created another mail thread. > [...] >>>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> >>>> >>>> PR tree-optimization/52252 >>>> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > > This test FAILs on sparc-sun-solaris2.11, both 32 and 64-bit: > > FAIL: gcc.dg/vect/pr52252-ld.c scan-tree-dump-times vect "vectorized 1 loops" 1 > FAIL: gcc.dg/vect/pr52252-ld.c -flto -ffat-lto-objects scan-tree-dump-times vect "vectorized 1 loops" 1 > > The dumps have > > /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not vectorized: relevant stmt not supported: in0_9 = *in_27; > /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:7:1: note: vectorized 0 loops in function. > > Rainer > > -- > ----------------------------------------------------------------------------- > Rainer Orth, Center for Biotechnology, Bielefeld University
On Mon, 12 May 2014, Evgeny Stupachenko wrote: > The test is on general changes. However I was able to test it on x86 only. > I see 2 possible solutions: > 1. Set the test for x86 only. > 2. Modify it so that it will pass on sparc-sun-solaris2. > > If 2. is not acceptable I'll create patch for 1. > Currently I don't see why "in0_9 = *in_27" is not supported. Does the > test fail because of unsupported permutation? The test uses /* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ that's bogus. You shouldn't add any dg-options. Instead use proper dg-effective-target checks for the mssse3 feature you are using. Note that the dg-final checking is applied regardless of the options above are applied or not. Why does the test only succeed with -mssse3 btw? The proper way to restrict the test to a single target is to use /* { dg-skip-if "why" { ! { x86_64-*-* i?86-*-* } } } */ Sorry for not catching this in the review. Richard. > > On Mon, May 12, 2014 at 7:14 PM, Rainer Orth > <ro@cebitec.uni-bielefeld.de> wrote: > > Evgeny Stupachenko <evstupac@gmail.com> writes: > > > >> Patch with fixes attached. > >> Currently if-structure is as following: > >> + if (count == 3) > >> ... > >> + else > >> + { > >> + /* If length is not equal to 3 then only power of 2 is supported. */ > >> + gcc_assert (exact_log2 (count) != -1); > >> > >> For stores group I've created another mail thread. > > [...] > >>>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com> > >>>> > >>>> PR tree-optimization/52252 > >>>> * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3. > > > > This test FAILs on sparc-sun-solaris2.11, both 32 and 64-bit: > > > > FAIL: gcc.dg/vect/pr52252-ld.c scan-tree-dump-times vect "vectorized 1 loops" 1 > > FAIL: gcc.dg/vect/pr52252-ld.c -flto -ffat-lto-objects scan-tree-dump-times vect "vectorized 1 loops" 1 > > > > The dumps have > > > > /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not vectorized: relevant stmt not supported: in0_9 = *in_27; > > /vol/gcc/src/hg/trunk/local/gcc/testsuite/gcc.dg/vect/pr52252-ld.c:7:1: note: vectorized 0 loops in function. > > > > Rainer > > > > -- > > ----------------------------------------------------------------------------- > > Rainer Orth, Center for Biotechnology, Bielefeld University > >
Evgeny Stupachenko <evstupac@gmail.com> writes: > The test is on general changes. However I was able to test it on x86 only. > I see 2 possible solutions: > 1. Set the test for x86 only. > 2. Modify it so that it will pass on sparc-sun-solaris2. > > If 2. is not acceptable I'll create patch for 1. > Currently I don't see why "in0_9 = *in_27" is not supported. Does the > test fail because of unsupported permutation? The .vect dump has gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: ==> examining statement: in0_9 = *in_27; gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: vect_is_simple_use: operand *in_27 gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not ssa-name. gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: use not simple. gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: vect_is_simple_use: operand *in_27 gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not ssa-name. gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: use not simple. gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: no array mode for V8QI[3] gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: shuffle of 3 loads is not supported by target gcc/testsuite/gcc.dg/vect/pr52252-ld.c:10:3: note: not vectorized: relevant stmt not supported: in0_9 = *in_27; I can send you the full dump if necessary. Rainer
It seems like "shuffle of 3 loads is not supported by target" is the root cause. The permutation like 0, 3, 6, 9, c, .... is not supported by the target.
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 274cdbd..feafb38 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -4812,36 +4812,74 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count) { enum machine_mode mode = TYPE_MODE (vectype); - /* vect_permute_load_chain requires the group size to be a power of two. */ - if (exact_log2 (count) == -1) + /* vect_permute_load_chain requires the group size to be equal to 3 or + be a power of two. */ + if (count != 3 && exact_log2 (count) == -1) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "the size of the group of accesses" - " is not a power of 2\n"); + "the size of the group of accesses" + " is not a power of 2 or not eqaul to 3\n"); return false; } /* Check that the permutation is supported. */ if (VECTOR_MODE_P (mode)) { - unsigned int i, nelt = GET_MODE_NUNITS (mode); + unsigned int i, j, nelt = GET_MODE_NUNITS (mode); unsigned char *sel = XALLOCAVEC (unsigned char, nelt); - for (i = 0; i < nelt; i++) - sel[i] = i * 2; - if (can_vec_perm_p (mode, false, sel)) + if (exact_log2 (count) != -1) { for (i = 0; i < nelt; i++) - sel[i] = i * 2 + 1; + sel[i] = i * 2; if (can_vec_perm_p (mode, false, sel)) - return true; + { + for (i = 0; i < nelt; i++) + sel[i] = i * 2 + 1; + if (can_vec_perm_p (mode, false, sel)) + return true; + } + } + else if (count == 3) + { + unsigned int k; + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by \ + target\n"); + return false; + } + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + if (!can_vec_perm_p (mode, false, sel)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "shuffle of 3 loads is not supported by \ + target\n"); + return false; + } + } + return true; } } if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "extract even/odd not supported by target\n"); + "extract even/odd not supported by target\n"); return false; } @@ -4859,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count) /* Function vect_permute_load_chain. Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be - a power of 2, generate extract_even/odd stmts to reorder the input data - correctly. Return the final references for loads in RESULT_CHAIN. + a power of 2 or equal to 3, generate extract_even/odd stmts to reorder + the input data correctly. Return the final references for loads in + RESULT_CHAIN. E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. The input is 4 vectors each containing 8 elements. We assign a number to each @@ -4941,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain, { tree data_ref, first_vect, second_vect; tree perm_mask_even, perm_mask_odd; + tree perm3_mask_low, perm3_mask_high; gimple perm_stmt; tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); unsigned int i, j, log_length = exact_log2 (length); @@ -4951,45 +4991,99 @@ vect_permute_load_chain (vec<tree> dr_chain, memcpy (result_chain->address (), dr_chain.address (), length * sizeof (tree)); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2; - perm_mask_even = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_even != NULL); + if (log_length != (unsigned int)-1) + { + for (i = 0; i < nelt; ++i) + sel[i] = i * 2; + perm_mask_even = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_even != NULL); - for (i = 0; i < nelt; ++i) - sel[i] = i * 2 + 1; - perm_mask_odd = vect_gen_perm_mask (vectype, sel); - gcc_assert (perm_mask_odd != NULL); + for (i = 0; i < nelt; ++i) + sel[i] = i * 2 + 1; + perm_mask_odd = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm_mask_odd != NULL); - for (i = 0; i < log_length; i++) - { - for (j = 0; j < length; j += 2) + for (i = 0; i < log_length; i++) { - first_vect = dr_chain[j]; - second_vect = dr_chain[j+1]; + for (j = 0; j < length; j += 2) + { + first_vect = dr_chain[j]; + second_vect = dr_chain[j+1]; + + /* data_ref = permute_even (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_even); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2] = data_ref; + + /* data_ref = permute_odd (first_data_ref, second_data_ref); */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, + first_vect, second_vect, + perm_mask_odd); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + (*result_chain)[j/2+length/2] = data_ref; + } + memcpy (dr_chain.address (), result_chain->address (), + length * sizeof (tree)); + } + } + /* length is not a power of 2. */ + else + { + unsigned int k; - /* data_ref = permute_even (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); + /* currently only length 3 is supported as most frequent case. */ + gcc_assert (length == 3); + + for (k = 0; k < 3; k++) + { + for (i = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = 3 * i + k; + else + sel[i] = 0; + perm3_mask_low = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_low != NULL); + + for (i = 0, j = 0; i < nelt; i++) + if (3 * i + k < 2 * nelt) + sel[i] = i; + else + sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); + + perm3_mask_high = vect_gen_perm_mask (vectype, sel); + gcc_assert (perm3_mask_high != NULL); + + first_vect = dr_chain[0]; + second_vect = dr_chain[1]; + + /* Create interleaving stmt (low part of): + low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_even); + perm3_mask_low); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2] = data_ref; - /* data_ref = permute_odd (first_data_ref, second_data_ref); */ - data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); + /* Create interleaving stmt (high part of): + high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, + ...}> */ + first_vect = data_ref; + second_vect = dr_chain[2]; + data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high"); perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, first_vect, second_vect, - perm_mask_odd); + perm3_mask_high); vect_finish_stmt_generation (stmt, perm_stmt, gsi); - (*result_chain)[j/2+length/2] = data_ref; + (*result_chain)[k] = data_ref; } - memcpy (dr_chain.address (), result_chain->address (), - length * sizeof (tree)); } } - /* Function vect_transform_grouped_load. Given a chain of input interleaved data-refs (in DR_CHAIN), build statements diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 1a51d6d..b87c143 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, include the cost of the permutes. */ if (!load_lanes_p && group_size > 1) { - /* Uses an even and odd extract operations for each needed permute. */ - int nstmts = ncopies * exact_log2 (group_size) * group_size; - inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm, - stmt_info, 0, vect_body); + /* Uses an even and odd extract operations or shuffle operations + for each needed permute. */ + int nstmts = ncopies * ceil_log2 (group_size) * group_size; + inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, + stmt_info, 0, vect_body); if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c new file mode 100644 index 0000000..6e3cb52 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ + +#define byte unsigned char + +void +matrix_mul (byte *in, byte *out, int size) +{ + int i; + for (i = 0; i < size; i++) + { + byte in0 = in[0]; + byte in1 = in[1]; + byte in2 = in[2]; + byte out0, out1, out2, out3; + out0 = in0 + in1; + out1 = in0 + in2; + out2 = in1 + in2; + out3 = in0 + in1 + in2; + out[0] = out0; + out[1] = out1; + out[2] = out2; + out[3] = out3; + in += 3; + out += 4; + } +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */