Message ID | OF68484037.2A6E3403-ONC22577B3.003FF33D-C22577B3.004224C6@il.ibm.com |
---|---|
State | New |
Headers | show |
On Tue, Oct 5, 2010 at 2:02 PM, Ira Rosen <IRAR@il.ibm.com> wrote: > > Hi, > > In function vect_get_mask_element several variables were declared as > static, which is problematic when there are more than one SLP instances in > a loop. > > Both 4.5 and 4.6 patches were bootstrapped and tested on x86_64-suse-linux. > > Applied to trunk. OK for 4.5? Ok. Thanks, Richard. > Thanks, > Ira > > > 4.5/4.6 > ChangeLog: > > PR tree-optimization/45752 > * tree-vect-slp.c (vect_get_mask_element): Remove static > variables, make them function arguments. > (vect_transform_slp_perm_load): Pass new arguments to > vect_get_mask_element. > > testsuite/ChangeLog: > > PR tree-optimization/45752 > * gcc.dg/vect/pr45752.c: New test. > > > 4.6 patch: > > Index: testsuite/gcc.dg/vect/pr45752.c > =================================================================== > --- testsuite/gcc.dg/vect/pr45752.c (revision 0) > +++ testsuite/gcc.dg/vect/pr45752.c (revision 0) > @@ -0,0 +1,109 @@ > +/* { dg-require-effective-target vect_int } */ > + > +#include <stdarg.h> > +#include <stdio.h> > +#include "tree-vect.h" > + > +#define M00 100 > +#define M10 216 > +#define M20 23 > +#define M30 237 > +#define M40 437 > + > +#define M01 1322 > +#define M11 13 > +#define M21 27271 > +#define M31 2280 > +#define M41 284 > + > +#define M02 74 > +#define M12 191 > +#define M22 500 > +#define M32 111 > +#define M42 1114 > + > +#define M03 134 > +#define M13 117 > +#define M23 11 > +#define M33 771 > +#define M43 71 > + > +#define M04 334 > +#define M14 147 > +#define M24 115 > +#define M34 7716 > +#define M44 16 > + > +#define N 16 > + > +void foo (unsigned int *__restrict__ pInput, > + unsigned int *__restrict__ pOutput, > + unsigned int *__restrict__ pInput2, > + unsigned int *__restrict__ pOutput2) > +{ > + unsigned int i, a, b, c, d, e; > + > + for (i = 0; i < N / 5; i++) > + { > + a = *pInput++; > + b = *pInput++; > + c = *pInput++; > + d = *pInput++; > + e = *pInput++; > + > + *pOutput++ = M00 * a + M01 * 
b + M02 * c + M03 * d + M04 * e; > + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; > + *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; > + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; > + *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; > + > + > + a = *pInput2++; > + b = *pInput2++; > + c = *pInput2++; > + d = *pInput2++; > + e = *pInput2++; > + > + *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; > + *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; > + *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; > + *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; > + *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; > + > + } > +} > + > +int main (int argc, const char* argv[]) > +{ > + unsigned int input[N], output[N], i, input2[N], output2[N]; > + unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, > + 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0}; > + unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956, > + 6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0}; > + > + check_vect (); > + > + for (i = 0; i < N; i++) > + { > + input[i] = i%256; > + input2[i] = i + 2; > + output[i] = 0; > + output2[i] = 0; > + __asm__ volatile (""); > + } > + > + foo (input, output, input2, output2); > + > + for (i = 0; i < N; i++) > + if (output[i] != check_results[i] > + || output2[i] != check_results2[i]) > + abort (); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "permutation requires at least three > vectors" 2 "vect" { target vect_perm } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 > "vect" } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > + > Index: tree-vect-slp.c > =================================================================== > --- 
tree-vect-slp.c (revision 164986) > +++ tree-vect-slp.c (working copy) > @@ -2177,20 +2177,18 @@ static bool > vect_get_mask_element (gimple stmt, int first_mask_element, int m, > int mask_nunits, bool only_one_vec, int index, > int *mask, int *current_mask_element, > - bool *need_next_vector) > + bool *need_next_vector, int *number_of_mask_fixes, > + bool *mask_fixed, bool *needs_first_vector) > { > int i; > - static int number_of_mask_fixes = 1; > - static bool mask_fixed = false; > - static bool needs_first_vector = false; > > /* Convert to target specific representation. */ > *current_mask_element = first_mask_element + m; > /* Adjust the value in case it's a mask for second and third vectors. > */ > - *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); > + *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1); > > if (*current_mask_element < mask_nunits) > - needs_first_vector = true; > + *needs_first_vector = true; > > /* We have only one input vector to permute but the mask accesses values > in > the next vector as well. */ > @@ -2208,7 +2206,7 @@ vect_get_mask_element (gimple stmt, int > /* The mask requires the next vector. */ > if (*current_mask_element >= mask_nunits * 2) > { > - if (needs_first_vector || mask_fixed) > + if (*needs_first_vector || *mask_fixed) > { > /* We either need the first vector too or have already moved to > the > next vector. In both cases, this permutation needs three > @@ -2226,23 +2224,23 @@ vect_get_mask_element (gimple stmt, int > /* We move to the next vector, dropping the first one and working > with > the second and the third - we need to adjust the values of the > mask > accordingly. 
*/ > - *current_mask_element -= mask_nunits * number_of_mask_fixes; > + *current_mask_element -= mask_nunits * *number_of_mask_fixes; > > for (i = 0; i < index; i++) > - mask[i] -= mask_nunits * number_of_mask_fixes; > + mask[i] -= mask_nunits * *number_of_mask_fixes; > > - (number_of_mask_fixes)++; > - mask_fixed = true; > + (*number_of_mask_fixes)++; > + *mask_fixed = true; > } > > - *need_next_vector = mask_fixed; > + *need_next_vector = *mask_fixed; > > /* This was the last element of this mask. Start a new one. */ > if (index == mask_nunits - 1) > { > - number_of_mask_fixes = 1; > - mask_fixed = false; > - needs_first_vector = false; > + *number_of_mask_fixes = 1; > + *mask_fixed = false; > + *needs_first_vector = false; > } > > return true; > @@ -2268,6 +2266,9 @@ vect_transform_slp_perm_load (gimple stm > int index, unroll_factor, *mask, current_mask_element, ncopies; > bool only_one_vec = false, need_next_vector = false; > int first_vec_index, second_vec_index, orig_vec_stmts_num, > vect_stmts_counter; > + int number_of_mask_fixes = 1; > + bool mask_fixed = false; > + bool needs_first_vector = false; > > if (!targetm.vectorize.builtin_vec_perm) > { > @@ -2351,7 +2352,9 @@ vect_transform_slp_perm_load (gimple stm > { > if (!vect_get_mask_element (stmt, first_mask_element, m, > mask_nunits, only_one_vec, index, mask, > - &current_mask_element, > &need_next_vector)) > + &current_mask_element, > &need_next_vector, > + &number_of_mask_fixes, &mask_fixed, > + &needs_first_vector)) > return false; > > mask[index++] = current_mask_element; > > 4.5 patch: > > Index: testsuite/gcc.dg/vect/pr45752.c > =================================================================== > --- testsuite/gcc.dg/vect/pr45752.c (revision 0) > +++ testsuite/gcc.dg/vect/pr45752.c (revision 0) > @@ -0,0 +1,109 @@ > +/* { dg-require-effective-target vect_int } */ > + > +#include <stdarg.h> > +#include <stdio.h> > +#include "tree-vect.h" > + > +#define M00 100 > +#define M10 216 > +#define M20 23 > +#define 
M30 237 > +#define M40 437 > + > +#define M01 1322 > +#define M11 13 > +#define M21 27271 > +#define M31 2280 > +#define M41 284 > + > +#define M02 74 > +#define M12 191 > +#define M22 500 > +#define M32 111 > +#define M42 1114 > + > +#define M03 134 > +#define M13 117 > +#define M23 11 > +#define M33 771 > +#define M43 71 > + > +#define M04 334 > +#define M14 147 > +#define M24 115 > +#define M34 7716 > +#define M44 16 > + > +#define N 16 > + > +void foo (unsigned int *__restrict__ pInput, > + unsigned int *__restrict__ pOutput, > + unsigned int *__restrict__ pInput2, > + unsigned int *__restrict__ pOutput2) > +{ > + unsigned int i, a, b, c, d, e; > + > + for (i = 0; i < N / 5; i++) > + { > + a = *pInput++; > + b = *pInput++; > + c = *pInput++; > + d = *pInput++; > + e = *pInput++; > + > + *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; > + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; > + *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; > + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; > + *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; > + > + > + a = *pInput2++; > + b = *pInput2++; > + c = *pInput2++; > + d = *pInput2++; > + e = *pInput2++; > + > + *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; > + *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; > + *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; > + *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; > + *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; > + > + } > +} > + > +int main (int argc, const char* argv[]) > +{ > + unsigned int input[N], output[N], i, input2[N], output2[N]; > + unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, > + 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0}; > + unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956, > + 6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 
0}; > + > + check_vect (); > + > + for (i = 0; i < N; i++) > + { > + input[i] = i%256; > + input2[i] = i + 2; > + output[i] = 0; > + output2[i] = 0; > + __asm__ volatile (""); > + } > + > + foo (input, output, input2, output2); > + > + for (i = 0; i < N; i++) > + if (output[i] != check_results[i] > + || output2[i] != check_results2[i]) > + abort (); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ > +/* { dg-final { scan-tree-dump-times "permutation requires at least three > vectors" 2 "vect" { target vect_perm } } } */ > +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 > "vect" } } */ > +/* { dg-final { cleanup-tree-dump "vect" } } */ > + > Index: tree-vect-slp.c > =================================================================== > --- tree-vect-slp.c (revision 164986) > +++ tree-vect-slp.c (working copy) > @@ -1714,20 +1714,18 @@ static bool > vect_get_mask_element (gimple stmt, int first_mask_element, int m, > int mask_nunits, bool only_one_vec, int index, > int *mask, int *current_mask_element, > - bool *need_next_vector) > + bool *need_next_vector, int *number_of_mask_fixes, > + bool *mask_fixed, bool *needs_first_vector) > { > int i; > - static int number_of_mask_fixes = 1; > - static bool mask_fixed = false; > - static bool needs_first_vector = false; > > /* Convert to target specific representation. */ > *current_mask_element = first_mask_element + m; > /* Adjust the value in case it's a mask for second and third vectors. > */ > - *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); > + *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1); > > if (*current_mask_element < mask_nunits) > - needs_first_vector = true; > + *needs_first_vector = true; > > /* We have only one input vector to permute but the mask accesses values > in > the next vector as well. 
*/ > @@ -1745,7 +1743,7 @@ vect_get_mask_element (gimple stmt, int > /* The mask requires the next vector. */ > if (*current_mask_element >= mask_nunits * 2) > { > - if (needs_first_vector || mask_fixed) > + if (*needs_first_vector || *mask_fixed) > { > /* We either need the first vector too or have already moved to > the > next vector. In both cases, this permutation needs three > @@ -1763,23 +1761,23 @@ vect_get_mask_element (gimple stmt, int > /* We move to the next vector, dropping the first one and working > with > the second and the third - we need to adjust the values of the > mask > accordingly. */ > - *current_mask_element -= mask_nunits * number_of_mask_fixes; > + *current_mask_element -= mask_nunits * *number_of_mask_fixes; > > for (i = 0; i < index; i++) > - mask[i] -= mask_nunits * number_of_mask_fixes; > + mask[i] -= mask_nunits * *number_of_mask_fixes; > > - (number_of_mask_fixes)++; > - mask_fixed = true; > + (*number_of_mask_fixes)++; > + *mask_fixed = true; > } > > - *need_next_vector = mask_fixed; > + *need_next_vector = *mask_fixed; > > /* This was the last element of this mask. Start a new one. 
*/ > if (index == mask_nunits - 1) > { > - number_of_mask_fixes = 1; > - mask_fixed = false; > - needs_first_vector = false; > + *number_of_mask_fixes = 1; > + *mask_fixed = false; > + *needs_first_vector = false; > } > > return true; > @@ -1805,6 +1803,9 @@ vect_transform_slp_perm_load (gimple stm > int index, unroll_factor, *mask, current_mask_element, ncopies; > bool only_one_vec = false, need_next_vector = false; > int first_vec_index, second_vec_index, orig_vec_stmts_num, > vect_stmts_counter; > + int number_of_mask_fixes = 1; > + bool mask_fixed = false; > + bool needs_first_vector = false; > > if (!targetm.vectorize.builtin_vec_perm) > { > @@ -1891,7 +1892,9 @@ vect_transform_slp_perm_load (gimple stm > { > if (!vect_get_mask_element (stmt, first_mask_element, m, > mask_nunits, only_one_vec, index, mask, > - &current_mask_element, > &need_next_vector)) > + &current_mask_element, > &need_next_vector, > + &number_of_mask_fixes, &mask_fixed, > + &needs_first_vector)) > return false; > > mask[index++] = current_mask_element; > >
Index: testsuite/gcc.dg/vect/pr45752.c =================================================================== --- testsuite/gcc.dg/vect/pr45752.c (revision 0) +++ testsuite/gcc.dg/vect/pr45752.c (revision 0) @@ -0,0 +1,109 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M30 237 +#define M40 437 + +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M31 2280 +#define M41 284 + +#define M02 74 +#define M12 191 +#define M22 500 +#define M32 111 +#define M42 1114 + +#define M03 134 +#define M13 117 +#define M23 11 +#define M33 771 +#define M43 71 + +#define M04 334 +#define M14 147 +#define M24 115 +#define M34 7716 +#define M44 16 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, + unsigned int *__restrict__ pOutput, + unsigned int *__restrict__ pInput2, + unsigned int *__restrict__ pOutput2) +{ + unsigned int i, a, b, c, d, e; + + for (i = 0; i < N / 5; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + d = *pInput++; + e = *pInput++; + + *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; + *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; + *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; + + + a = *pInput2++; + b = *pInput2++; + c = *pInput2++; + d = *pInput2++; + e = *pInput2++; + + *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; + *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; + *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; + *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; + *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; + + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i, input2[N], output2[N]; + unsigned int 
check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, + 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0}; + unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956, + 6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0}; + + check_vect (); + + for (i = 0; i < N; i++) + { + input[i] = i%256; + input2[i] = i + 2; + output[i] = 0; + output2[i] = 0; + __asm__ volatile (""); + } + + foo (input, output, input2, output2); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i] + || output2[i] != check_results2[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + Index: tree-vect-slp.c =================================================================== --- tree-vect-slp.c (revision 164986) +++ tree-vect-slp.c (working copy) @@ -2177,20 +2177,18 @@ static bool vect_get_mask_element (gimple stmt, int first_mask_element, int m, int mask_nunits, bool only_one_vec, int index, int *mask, int *current_mask_element, - bool *need_next_vector) + bool *need_next_vector, int *number_of_mask_fixes, + bool *mask_fixed, bool *needs_first_vector) { int i; - static int number_of_mask_fixes = 1; - static bool mask_fixed = false; - static bool needs_first_vector = false; /* Convert to target specific representation. */ *current_mask_element = first_mask_element + m; /* Adjust the value in case it's a mask for second and third vectors. 
*/ - *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); + *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1); if (*current_mask_element < mask_nunits) - needs_first_vector = true; + *needs_first_vector = true; /* We have only one input vector to permute but the mask accesses values in the next vector as well. */ @@ -2208,7 +2206,7 @@ vect_get_mask_element (gimple stmt, int /* The mask requires the next vector. */ if (*current_mask_element >= mask_nunits * 2) { - if (needs_first_vector || mask_fixed) + if (*needs_first_vector || *mask_fixed) { /* We either need the first vector too or have already moved to the next vector. In both cases, this permutation needs three @@ -2226,23 +2224,23 @@ vect_get_mask_element (gimple stmt, int /* We move to the next vector, dropping the first one and working with the second and the third - we need to adjust the values of the mask accordingly. */ - *current_mask_element -= mask_nunits * number_of_mask_fixes; + *current_mask_element -= mask_nunits * *number_of_mask_fixes; for (i = 0; i < index; i++) - mask[i] -= mask_nunits * number_of_mask_fixes; + mask[i] -= mask_nunits * *number_of_mask_fixes; - (number_of_mask_fixes)++; - mask_fixed = true; + (*number_of_mask_fixes)++; + *mask_fixed = true; } - *need_next_vector = mask_fixed; + *need_next_vector = *mask_fixed; /* This was the last element of this mask. Start a new one. 
*/ if (index == mask_nunits - 1) { - number_of_mask_fixes = 1; - mask_fixed = false; - needs_first_vector = false; + *number_of_mask_fixes = 1; + *mask_fixed = false; + *needs_first_vector = false; } return true; @@ -2268,6 +2266,9 @@ vect_transform_slp_perm_load (gimple stm int index, unroll_factor, *mask, current_mask_element, ncopies; bool only_one_vec = false, need_next_vector = false; int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter; + int number_of_mask_fixes = 1; + bool mask_fixed = false; + bool needs_first_vector = false; if (!targetm.vectorize.builtin_vec_perm) { @@ -2351,7 +2352,9 @@ vect_transform_slp_perm_load (gimple stm { if (!vect_get_mask_element (stmt, first_mask_element, m, mask_nunits, only_one_vec, index, mask, - &current_mask_element, &need_next_vector)) + &current_mask_element, &need_next_vector, + &number_of_mask_fixes, &mask_fixed, + &needs_first_vector)) return false; mask[index++] = current_mask_element; 4.5 patch: Index: testsuite/gcc.dg/vect/pr45752.c =================================================================== --- testsuite/gcc.dg/vect/pr45752.c (revision 0) +++ testsuite/gcc.dg/vect/pr45752.c (revision 0) @@ -0,0 +1,109 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M30 237 +#define M40 437 + +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M31 2280 +#define M41 284 + +#define M02 74 +#define M12 191 +#define M22 500 +#define M32 111 +#define M42 1114 + +#define M03 134 +#define M13 117 +#define M23 11 +#define M33 771 +#define M43 71 + +#define M04 334 +#define M14 147 +#define M24 115 +#define M34 7716 +#define M44 16 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, + unsigned int *__restrict__ pOutput, + unsigned int *__restrict__ pInput2, + unsigned int *__restrict__ pOutput2) +{ + unsigned int i, a, b, c, d, e; + + for (i = 0; i < N / 5; i++) + { + 
a = *pInput++; + b = *pInput++; + c = *pInput++; + d = *pInput++; + e = *pInput++; + + *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; + *pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; + *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; + + + a = *pInput2++; + b = *pInput2++; + c = *pInput2++; + d = *pInput2++; + e = *pInput2++; + + *pOutput2++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; + *pOutput2++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; + *pOutput2++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e; + *pOutput2++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; + *pOutput2++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; + + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i, input2[N], output2[N]; + unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, + 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0}; + unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956, + 6122, 224204, 113484, 16243, 26776, 9542, 363804, 169059, 25853, 0}; + + check_vect (); + + for (i = 0; i < N; i++) + { + input[i] = i%256; + input2[i] = i + 2; + output[i] = 0; + output2[i] = 0; + __asm__ volatile (""); + } + + foo (input, output, input2, output2); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i] + || output2[i] != check_results2[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 2 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + Index: tree-vect-slp.c =================================================================== --- 
tree-vect-slp.c (revision 164986) +++ tree-vect-slp.c (working copy) @@ -1714,20 +1714,18 @@ static bool vect_get_mask_element (gimple stmt, int first_mask_element, int m, int mask_nunits, bool only_one_vec, int index, int *mask, int *current_mask_element, - bool *need_next_vector) + bool *need_next_vector, int *number_of_mask_fixes, + bool *mask_fixed, bool *needs_first_vector) { int i; - static int number_of_mask_fixes = 1; - static bool mask_fixed = false; - static bool needs_first_vector = false; /* Convert to target specific representation. */ *current_mask_element = first_mask_element + m; /* Adjust the value in case it's a mask for second and third vectors. */ - *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); + *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1); if (*current_mask_element < mask_nunits) - needs_first_vector = true; + *needs_first_vector = true; /* We have only one input vector to permute but the mask accesses values