
[PR middle-end/71488] Fix vectorization of comparison of booleans

Message ID 20160616110612.GB62320@msticlxl57.ims.intel.com
State New

Commit Message

Ilya Enkovich June 16, 2016, 11:06 a.m. UTC
Hi,

This patch fixes incorrect vectorization of comparisons of booleans.
The problem is that a regular comparison, which works for scalars,
doesn't work for vectors due to their different binary representation,
and it never works for scalar masks at all.

This patch replaces such comparisons with bitwise operations
which work correctly for both vector and scalar masks.

Bootstrapped and regtested on x86_64-unknown-linux-gnu.  Is it
OK for trunk?  What should be done for gcc-6-branch?  Port this
patch or just restrict vectorization for comparison of booleans?

Thanks,
Ilya
--
gcc/

2016-06-15  Ilya Enkovich  <ilya.enkovich@intel.com>

	PR middle-end/71488
	* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Support
	comparison of boolean vectors.
	* tree-vect-stmts.c (vectorizable_comparison): Vectorize comparison
	of boolean vectors using bitwise operations.

gcc/testsuite/

2016-06-15  Ilya Enkovich  <ilya.enkovich@intel.com>

	PR middle-end/71488
	* g++.dg/pr71488.C: New test.
	* gcc.dg/vect/vect-bool-cmp.c: New test.

Comments

Jeff Law June 21, 2016, 8:57 p.m. UTC | #1
On 06/16/2016 05:06 AM, Ilya Enkovich wrote:
> [...]
OK.  Given this is a code generation bug, I'll support porting this 
patch to the gcc-6 branch.  Is there any reason to think that porting 
it would be more risky than usual?  It looks pretty simple to me; am I 
missing some subtle dependency?

jeff
Ilya Enkovich June 22, 2016, 2 p.m. UTC | #2
2016-06-21 23:57 GMT+03:00 Jeff Law <law@redhat.com>:
> On 06/16/2016 05:06 AM, Ilya Enkovich wrote:
>> [...]
> OK.  Given this is a code generation bug, I'll support porting this patch to
> the gcc-6 branch.  Is there any reason to think that porting it would be more
> risky than usual?  It looks pretty simple to me; am I missing some subtle
> dependency?

I don't feel this patch is too risky.  I asked only because a simple
restriction on mask comparisons would be even safer.

Thanks,
Ilya


Ilya Enkovich June 27, 2016, 4:33 p.m. UTC | #3
Looks like it caused PR71655 and therefore is not so safe :/

Ilya

2016-06-22 17:00 GMT+03:00 Ilya Enkovich <enkovich.gnu@gmail.com>:
> 2016-06-21 23:57 GMT+03:00 Jeff Law <law@redhat.com>:
>> [...]
>
> I don't feel this patch is too risky.  I asked only because a simple
> restriction on mask comparisons would be even safer.
>
> Thanks,
> Ilya

Patch

diff --git a/gcc/testsuite/g++.dg/pr71488.C b/gcc/testsuite/g++.dg/pr71488.C
new file mode 100644
index 0000000..d7d657e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr71488.C
@@ -0,0 +1,24 @@ 
+// PR middle-end/71488
+// { dg-do run }
+// { dg-options "-O3 -std=c++11" }
+// { dg-additional-options "-march=westmere" { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target c++11 }
+
+#include <valarray>
+
+int var_4 = 1;
+long long var_9 = 0;
+
+int main() {
+  
+  std::valarray<std::valarray<long long>> v10;
+
+  v10.resize(1);
+  v10[0].resize(4);
+
+  for (int i = 0; i < 4; i++)
+    v10[0][i] = ((var_9 == 0) > unsigned (var_4 == 0)) + (var_9 == 0);
+
+  if (v10[0][0] != 2)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
new file mode 100644
index 0000000..a1e2a24
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
@@ -0,0 +1,252 @@ 
+/* PR71488 */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
+
+int i1, i2;
+
+void __attribute__((noclone,noinline))
+fn1 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn2 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn3 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn4 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn5 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn6 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn7 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn8 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn9 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn10 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn11 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn12 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn13 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn14 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn15 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn16 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn17 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn18 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+int eq (int i1, int i2) { return i1 == i2; }
+int ne (int i1, int i2) { return i1 != i2; }
+int lt (int i1, int i2) { return i1 < i2; }
+int le (int i1, int i2) { return i1 <= i2; }
+int gt (int i1, int i2) { return i1 > i2; }
+int ge (int i1, int i2) { return i1 >= i2; }
+
+typedef int (*cmp_fn)(int, int);
+
+void
+check (int *p, cmp_fn fn)
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      int t1 = ((i % 4) > 1) == 0;
+      int t2 = (i % 2) == 0;
+      int res = fn (t1, t2) + t1;
+      if (p[i] != res)
+	__builtin_abort ();
+    }
+}
+
+int
+main (int argc, char **argv)
+{
+  int i1[32], i2[32], res[32];
+  short s2[32];
+  long long l2[32];
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      l2[i] = i2[i] = s2[i] = i % 2;
+      i1[i] = (i % 4) > 1;
+      asm ("":::"memory");
+    }
+
+  fn1 (res, i1, i2, 32);
+  check (res, gt);
+  fn2 (res, i1, s2, 32);
+  check (res, gt);
+  fn3 (res, i1, l2, 32);
+  check (res, gt);
+
+  fn4 (res, i1, i2, 32);
+  check (res, ge);
+  fn5 (res, i1, s2, 32);
+  check (res, ge);
+  fn6 (res, i1, l2, 32);
+  check (res, ge);
+
+  fn7 (res, i1, i2, 32);
+  check (res, lt);
+  fn8 (res, i1, s2, 32);
+  check (res, lt);
+  fn9 (res, i1, l2, 32);
+  check (res, lt);
+
+  fn10 (res, i1, i2, 32);
+  check (res, le);
+  fn11 (res, i1, s2, 32);
+  check (res, le);
+  fn12 (res, i1, l2, 32);
+  check (res, le);
+
+  fn13 (res, i1, i2, 32);
+  check (res, eq);
+  fn14 (res, i1, s2, 32);
+  check (res, eq);
+  fn15 (res, i1, l2, 32);
+  check (res, eq);
+
+  fn16 (res, i1, i2, 32);
+  check (res, ne);
+  fn17 (res, i1, s2, 32);
+  check (res, ne);
+  fn18 (res, i1, l2, 32);
+  check (res, ne);
+}
+
+/* { dg-final { scan-tree-dump-times "VECTORIZED" 18 "vect" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8a2221f..f0c515d 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -3763,7 +3763,8 @@  vect_recog_mask_conversion_pattern (vec<gimple *> *stmts, tree *type_in,
 
   if (rhs_code != BIT_IOR_EXPR
       && rhs_code != BIT_XOR_EXPR
-      && rhs_code != BIT_AND_EXPR)
+      && rhs_code != BIT_AND_EXPR
+      && TREE_CODE_CLASS (rhs_code) != tcc_comparison)
     return NULL;
 
   rhs2 = gimple_assign_rhs2 (last_stmt);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c74f14f..5c65502 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -7756,7 +7756,7 @@  vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
   enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
   unsigned nunits;
   int ncopies;
-  enum tree_code code;
+  enum tree_code code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
   stmt_vec_info prev_stmt_info = NULL;
   int i, j;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
@@ -7829,11 +7829,74 @@  vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
   else if (nunits != TYPE_VECTOR_SUBPARTS (vectype))
     return false;
 
+  /* Can't compare mask and non-mask types.  */
+  if (vectype1 && vectype2
+      && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
+    return false;
+
+  /* Boolean values may have another representation in vectors
+     and therefore we prefer bit operations over comparison for
+     them (which also works for scalar masks).  We store opcodes
+     to use in bitop1 and bitop2.  Statement is vectorized as
+       BITOP2 (rhs1 BITOP1 rhs2) or
+       rhs1 BITOP2 (BITOP1 rhs2)
+     depending on bitop1 and bitop2 arity.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+    {
+      if (code == GT_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_AND_EXPR;
+	}
+      else if (code == GE_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_IOR_EXPR;
+	}
+      else if (code == LT_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_AND_EXPR;
+	  std::swap (rhs1, rhs2);
+	}
+      else if (code == LE_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_IOR_EXPR;
+	  std::swap (rhs1, rhs2);
+	}
+      else
+	{
+	  bitop1 = BIT_XOR_EXPR;
+	  if (code == EQ_EXPR)
+	    bitop2 = BIT_NOT_EXPR;
+	}
+    }
+
   if (!vec_stmt)
     {
       STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
-      vect_model_simple_cost (stmt_info, ncopies, dts, NULL, NULL);
-      return expand_vec_cmp_expr_p (vectype, mask_type);
+      vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)),
+			      dts, NULL, NULL);
+      if (bitop1 == NOP_EXPR)
+	return expand_vec_cmp_expr_p (vectype, mask_type);
+      else
+	{
+	  machine_mode mode = TYPE_MODE (vectype);
+	  optab optab;
+
+	  optab = optab_for_tree_code (bitop1, vectype, optab_default);
+	  if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+	    return false;
+
+	  if (bitop2 != NOP_EXPR)
+	    {
+	      optab = optab_for_tree_code (bitop2, vectype, optab_default);
+	      if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+		return false;
+	    }
+	  return true;
+	}
     }
 
   /* Transform.  */
@@ -7890,8 +7953,31 @@  vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
 	  vec_rhs2 = vec_oprnds1[i];
 
 	  new_temp = make_ssa_name (mask);
-	  new_stmt = gimple_build_assign (new_temp, code, vec_rhs1, vec_rhs2);
-	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	  if (bitop1 == NOP_EXPR)
+	    {
+	      new_stmt = gimple_build_assign (new_temp, code,
+					      vec_rhs1, vec_rhs2);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	    }
+	  else
+	    {
+	      if (bitop1 == BIT_NOT_EXPR)
+		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
+	      else
+		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
+						vec_rhs2);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      if (bitop2 != NOP_EXPR)
+		{
+		  tree res = make_ssa_name (mask);
+		  if (bitop2 == BIT_NOT_EXPR)
+		    new_stmt = gimple_build_assign (res, bitop2, new_temp);
+		  else
+		    new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
+						    new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		}
+	    }
 	  if (slp_node)
 	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
 	}