Message ID | 20240624135510.3509497-1-pan2.li@intel.com |
---|---|
State | New |
Series | [v2] Vect: Support truncate after .SAT_SUB pattern in zip |
Hi,

> -----Original Message-----
> From: pan2.li@intel.com <pan2.li@intel.com>
> Sent: Monday, June 24, 2024 2:55 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com;
> jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com>
> Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip
>
> From: Pan Li <pan2.li@intel.com>
>
> The zip benchmark of coremark-pro has one SAT_SUB-like pattern, but
> truncated as below:
>
> void test (uint16_t *x, unsigned b, unsigned n)
> {
>   unsigned a = 0;
>   register uint16_t *p = x;
>
>   do {
>     a = *--p;
>     *p = (uint16_t)(a >= b ? a - b : 0);  // Truncate after .SAT_SUB
>   } while (--n);
> }
>
> Before the vect pass the gimple looks as below; it does not hit any
> SAT_SUB pattern and therefore cannot be vectorized to SAT_SUB.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = _18 ? iftmp.0_13 : 0;
>
> This patch improves the pattern match to recognize the above as a
> truncate after .SAT_SUB pattern.  We then get a pattern similar to the
> below, and the first 3 dead stmts are eliminated.
>
> _2 = a_11 - b_12(D);
> iftmp.0_13 = (short unsigned int) _2;
> _18 = a_11 >= b_12(D);
> iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));
>

I guess this is because one branch of the cond is a constant, so the
convert is folded in.  I was wondering though, can't we just push in
the truncate in this case?

i.e. in this case we know both types are unsigned, the difference is
positive, and its max value is the max value of the truncated type.

It seems like folding, as a general rule,

_1 = *p_10;
a_11 = (unsigned int) _1;
_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = _18 ? iftmp.0_13 : 0;
*p_10 = iftmp.0_5;

into

_1 = *p_10;
a_11 = (unsigned int) _1;
_2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D));
iftmp.0_13 = _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = _18 ? iftmp.0_13 : 0;
*p_10 = iftmp.0_5;

is valid (though I might have missed something).  This would remove the
need for this change to the vectorizer and saturation detection, and
should also generate better vector code.  This is what we do in the
general case: https://godbolt.org/z/dfoj6fWdv
I think here we're just not seeing through the cond.

Typically lots of architectures have cheap truncation operations, so
truncating before saturation means you do the cheap operation first
rather than doing the complex operation on the wider type.

That is,

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D));

is cheaper than

_2 = a_11 - b_12(D);
iftmp.0_13 = (short unsigned int) _2;
_18 = a_11 >= b_12(D);
iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D));

after vectorization.  Normally the vectorizer will try to do this
through over-widening detection as well, but we haven't taught ranger
about the ranges of these new IFNs (probably should at some point).

Cheers,
Tamar

> The below tests are passed for this patch.
> 1. The rv64gcv fully regression tests.
> 2. The rv64gcv build with glibc.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
>
> gcc/ChangeLog:
>
> 	* match.pd: Add convert description for minus and capture.
> 	* tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add
> 	new logic to handle the case where in_type is incompatible with
> 	out_type, as well as rename from.
> 	(vect_recog_build_binary_gimple_stmt): Rename to.
> (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). */ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ 
vect_recog_sat_sub_pattern (vec_info *vinfo, > stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1
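A minimal scalar sketch of the semantics under discussion (not part of the patch; the function name is made up for illustration): the saturating subtract is performed in the wide unsigned type and only afterwards is the result narrowed, exactly as in the zip loop above.

#include <stdint.h>

/* Scalar reference for "truncate after .SAT_SUB": saturate in the
   wide unsigned type first, then narrow the result.  This is the
   semantics the new vectorizer pattern has to preserve.  */
static uint16_t
sat_sub_u32_then_trunc (uint32_t a, uint32_t b)
{
  uint32_t sat = a >= b ? a - b : 0;   /* .SAT_SUB (a, b) in uint32_t.  */
  return (uint16_t) sat;               /* Truncate after .SAT_SUB.      */
}

For example, sat_sub_u32_then_trunc (0x1ffff, 1) returns 0xfffe, i.e. the narrowing really can drop high bits of the saturated result.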
Thanks Tamar for comments. It indeed benefits the vectorized code, for example in RISC-V, we may eliminate some vsetvel insn in loop for widen here. > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > is cheaper than > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); I am not sure if it has any correctness problem for this transform, take uint16_t to uint8_t as example. uint16_t a, b; uint8_t result = (uint8_t)(a >= b ? a - b : 0); Given a = 0x100; // 256 b = 0xff; // 255 For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, 255) = 0 For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, 255) = 1 Please help to correct me if any misunderstanding, thanks again for enlightening. Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Tuesday, June 25, 2024 4:00 AM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip Hi, > -----Original Message----- > From: pan2.li@intel.com <pan2.li@intel.com> > Sent: Monday, June 24, 2024 2:55 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > From: Pan Li <pan2.li@intel.com> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > truncated as below: > > void test (uint16_t *x, unsigned b, unsigned n) > { > unsigned a = 0; > register uint16_t *p = x; > > do { > a = *--p; > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > } while (--n); > } > > It will have gimple before vect pass, it cannot hit any pattern of > SAT_SUB and then cannot vectorize to SAT_SUB. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > This patch would like to improve the pattern match to recog above > as truncate after .SAT_SUB pattern. Then we will have the pattern > similar to below, as well as eliminate the first 3 dead stmt. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > I guess this is because one branch of the cond is a constant so the convert is folded in. I was wondering though, can't we just push in the truncate in this case? i.e. in this case we know both types are unsigned and the difference positive and max value is the max value of the truncate type. It seems like folding as a general rule _1 = *p_10; a_11 = (unsigned int) _1; _2 = a_11 - b_12(D); iftmp.0_13 = (short unsigned int) _2; _18 = a_11 >= b_12(D); iftmp.0_5 = _18 ? iftmp.0_13 : 0; *p_10 = iftmp.0_5; Into _1 = *p_10; a_11 = (unsigned int) _1; _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); iftmp.0_13 = _2; _18 = a_11 >= b_12(D); iftmp.0_5 = _18 ? iftmp.0_13 : 0; *p_10 = iftmp.0_5; Is valid (though might have missed something). This would negate the need for this change to the vectorizer and saturation detection but also should generate better vector code. This is what we do in the general case https://godbolt.org/z/dfoj6fWdv I think here we're just not seeing through the cond. 
Typically lots of architectures have cheap truncation operations, so truncating before saturation means you do the cheap operation first rather than doing the complex operation on the wider type. That is, _2 = a_11 - b_12(D); iftmp.0_13 = (short unsigned int) _2; _18 = a_11 >= b_12(D); iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); is cheaper than _2 = a_11 - b_12(D); iftmp.0_13 = (short unsigned int) _2; _18 = a_11 >= b_12(D); iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); after vectorization. Normally the vectorizer will try to do this through over-widening detection as well, but we haven't taught ranger about the ranges of these new IFNs (probably should at some point). Cheers, Tamar > The below tests are passed for this patch. > 1. The rv64gcv fully regression tests. > 2. The rv64gcv build with glibc. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > gcc/ChangeLog: > > * match.pd: Add convert description for minus and capture. > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > new logic to handle in_type is incompatibile with out_type, as > well as rename from. > (vect_recog_build_binary_gimple_stmt): Rename to. > (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1
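The counterexample above can be checked with a small standalone program; this is only an illustration of the argument, not code from the patch.

#include <stdint.h>
#include <stdio.h>

/* Truncating the operands before the saturating subtract is not the
   same as truncating its result.  */
static uint8_t  sat_sub_u8  (uint8_t a,  uint8_t b)  { return a >= b ? a - b : 0; }
static uint16_t sat_sub_u16 (uint16_t a, uint16_t b) { return a >= b ? a - b : 0; }

int
main (void)
{
  uint16_t a = 0x100, b = 0xff;
  /* Operands truncated first:
     .SAT_SUB ((uint8_t) 256, (uint8_t) 255) = .SAT_SUB (0, 255) = 0.  */
  uint8_t trunc_first = sat_sub_u8 ((uint8_t) a, (uint8_t) b);
  /* Saturate in the wide type, then truncate:
     (uint8_t) .SAT_SUB (256, 255) = 1.  */
  uint8_t trunc_after = (uint8_t) sat_sub_u16 (a, b);
  printf ("truncate first: %d, truncate after: %d\n", trunc_first, trunc_after);
  return 0;
}

It prints 0 and 1 respectively, matching the 256/255 case worked out above.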
> -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Tuesday, June 25, 2024 3:25 AM > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > Thanks Tamar for comments. It indeed benefits the vectorized code, for example in > RISC-V, we may eliminate some vsetvel insn in loop for widen here. > > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > is cheaper than > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > I am not sure if it has any correctness problem for this transform, take uint16_t to > uint8_t as example. > > uint16_t a, b; > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > Given a = 0x100; // 256 > b = 0xff; // 255 > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, > 255) = 0 > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, > 255) = 1 > > Please help to correct me if any misunderstanding, thanks again for enlightening. Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should have though it through more. Tamar. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, June 25, 2024 4:00 AM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > Hi, > > > -----Original Message----- > > From: pan2.li@intel.com <pan2.li@intel.com> > > Sent: Monday, June 24, 2024 2:55 PM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > From: Pan Li <pan2.li@intel.com> > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > truncated as below: > > > > void test (uint16_t *x, unsigned b, unsigned n) > > { > > unsigned a = 0; > > register uint16_t *p = x; > > > > do { > > a = *--p; > > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > > } while (--n); > > } > > > > It will have gimple before vect pass, it cannot hit any pattern of > > SAT_SUB and then cannot vectorize to SAT_SUB. > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > > > This patch would like to improve the pattern match to recog above > > as truncate after .SAT_SUB pattern. Then we will have the pattern > > similar to below, as well as eliminate the first 3 dead stmt. > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > I guess this is because one branch of the cond is a constant so the > convert is folded in. I was wondering though, can't we just push > in the truncate in this case? > > i.e. in this case we know both types are unsigned and the difference > positive and max value is the max value of the truncate type. 
> > It seems like folding as a general rule > > _1 = *p_10; > a_11 = (unsigned int) _1; > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > *p_10 = iftmp.0_5; > > Into > > _1 = *p_10; > a_11 = (unsigned int) _1; > _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); > iftmp.0_13 = _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > *p_10 = iftmp.0_5; > > Is valid (though might have missed something). This would negate the need for > this change to the vectorizer and saturation detection > but also should generate better vector code. This is what we do in the general case > https://godbolt.org/z/dfoj6fWdv > I think here we're just not seeing through the cond. > > Typically lots of architectures have cheap truncation operations, so truncating > before saturation means you do the cheap > operation first rather than doing the complex operation on the wider type. > > That is, > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > is cheaper than > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > after vectorization. Normally the vectorizer will try to do this through over- > widening detection as well, > but we haven't taught ranger about the ranges of these new IFNs (probably > should at some point). > > Cheers, > Tamar > > > The below tests are passed for this patch. > > 1. The rv64gcv fully regression tests. > > 2. The rv64gcv build with glibc. > > 3. The x86 bootstrap tests. > > 4. The x86 fully regression tests. > > > > gcc/ChangeLog: > > > > * match.pd: Add convert description for minus and capture. > > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > > new logic to handle in_type is incompatibile with out_type, as > > well as rename from. > > (vect_recog_build_binary_gimple_stmt): Rename to. > > (vect_recog_sat_add_pattern): Leverage above renamed func. > > (vect_recog_sat_sub_pattern): Ditto. > > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > --- > > gcc/match.pd | 4 +-- > > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > > 2 files changed, 33 insertions(+), 22 deletions(-) > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 3d0689c9312..4a4b0b2e72f 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > /* Unsigned saturation sub, case 2 (branch with ge): > > SAT_U_SUB = X >= Y ? X - Y : 0. */ > > (match (unsigned_integer_sat_sub @0 @1) > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > > integer_zerop) > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > > - && types_match (type, @0, @1)))) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > > > /* Unsigned saturation sub, case 3 (branchless with gt): > > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > index cef901808eb..3d887d36050 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > > > -static gcall * > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > > +static gimple * > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info > stmt_info, > > internal_fn fn, tree *type_out, > > - tree op_0, tree op_1) > > + tree lhs, tree op_0, tree op_1) > > { > > tree itype = TREE_TYPE (op_0); > > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > > + tree otype = TREE_TYPE (lhs); > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > > > - if (vtype != NULL_TREE > > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > > { > > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > > + gimple_call_set_lhs (call, in_ssa); > > gimple_call_set_nothrow (call, /* nothrow_p */ false); > > - gimple_set_location (call, gimple_location (stmt)); > > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT > (stmt_info))); > > + > > + *type_out = v_otype; > > > > - *type_out = vtype; > > + if (types_compatible_p (itype, otype)) > > + return call; > > + else > > + { > > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > > > - return call; > > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > > + } > > } > > > > return NULL; > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > > stmt_vec_info stmt_vinfo, > > > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > > { > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > - IFN_SAT_ADD, type_out, > > - ops[0], ops[1]); > > - if (call) > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > + IFN_SAT_ADD, type_out, > > + lhs, ops[0], ops[1]); > > + if (stmt) > > { > > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > > - return call; > > + return stmt; > > } > > } > > > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > > stmt_vec_info stmt_vinfo, > > > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > > { > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > - IFN_SAT_SUB, type_out, > > - ops[0], ops[1]); > > - if (call) > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > + IFN_SAT_SUB, type_out, > > + lhs, ops[0], ops[1]); > > + if (stmt) > > { > > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > > - return call; > > + return stmt; > > } > > } > > > > -- > > 2.34.1
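For contrast, the fold sketched earlier is fine for a plain (wrapping) subtraction: truncating an unsigned value is reduction modulo 2^16, which commutes with the minus, so it is only the saturation that breaks the equivalence. The check below is a general unsigned-arithmetic fact rather than anything the patch changes.

#include <stdint.h>
#include <assert.h>

/* (uint16_t) (a - b) == (uint16_t) ((uint16_t) a - (uint16_t) b) for all
   unsigned a, b: both sides are (a - b) mod 2^16.  This is why pushing
   the convert into a plain MINUS is valid, while pushing it into
   .SAT_SUB is not (see the 256/255 example above).  */
static void
check_trunc_commutes_with_minus (uint32_t a, uint32_t b)
{
  uint16_t wide_then_trunc  = (uint16_t) (a - b);
  uint16_t trunc_then_minus = (uint16_t) ((uint16_t) a - (uint16_t) b);
  assert (wide_then_trunc == trunc_then_minus);
}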
> Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should have > though it through more. Never mind, but you enlighten me for even more optimize with some restrictions. I revisited the pattern, for example as below. uint16_t a, b; uint8_t result = (uint8_t)(a >= b ? a - b : 0); => result = (char unsigned).SAT_SUB (a, b) If a has a def like below uint8_t other = 0x1f; a = (uint8_t)other then we can safely convert result = (char unsigned).SAT_SUB (a, b) to result = .SAT_SUB ((char unsigned)a, (char unsigned).b) Then we may have better vectorized code if a is limited to char unsigned. Of course we can do that based on this patch. Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Tuesday, June 25, 2024 12:01 PM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Tuesday, June 25, 2024 3:25 AM > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > Thanks Tamar for comments. It indeed benefits the vectorized code, for example in > RISC-V, we may eliminate some vsetvel insn in loop for widen here. > > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > is cheaper than > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > I am not sure if it has any correctness problem for this transform, take uint16_t to > uint8_t as example. > > uint16_t a, b; > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > Given a = 0x100; // 256 > b = 0xff; // 255 > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, > 255) = 0 > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, > 255) = 1 > > Please help to correct me if any misunderstanding, thanks again for enlightening. Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should have though it through more. Tamar. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, June 25, 2024 4:00 AM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > Hi, > > > -----Original Message----- > > From: pan2.li@intel.com <pan2.li@intel.com> > > Sent: Monday, June 24, 2024 2:55 PM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > From: Pan Li <pan2.li@intel.com> > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > truncated as below: > > > > void test (uint16_t *x, unsigned b, unsigned n) > > { > > unsigned a = 0; > > register uint16_t *p = x; > > > > do { > > a = *--p; > > *p = (uint16_t)(a >= b ? 
a - b : 0); // Truncate after .SAT_SUB > > } while (--n); > > } > > > > It will have gimple before vect pass, it cannot hit any pattern of > > SAT_SUB and then cannot vectorize to SAT_SUB. > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > > > This patch would like to improve the pattern match to recog above > > as truncate after .SAT_SUB pattern. Then we will have the pattern > > similar to below, as well as eliminate the first 3 dead stmt. > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > I guess this is because one branch of the cond is a constant so the > convert is folded in. I was wondering though, can't we just push > in the truncate in this case? > > i.e. in this case we know both types are unsigned and the difference > positive and max value is the max value of the truncate type. > > It seems like folding as a general rule > > _1 = *p_10; > a_11 = (unsigned int) _1; > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > *p_10 = iftmp.0_5; > > Into > > _1 = *p_10; > a_11 = (unsigned int) _1; > _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); > iftmp.0_13 = _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > *p_10 = iftmp.0_5; > > Is valid (though might have missed something). This would negate the need for > this change to the vectorizer and saturation detection > but also should generate better vector code. This is what we do in the general case > https://godbolt.org/z/dfoj6fWdv > I think here we're just not seeing through the cond. > > Typically lots of architectures have cheap truncation operations, so truncating > before saturation means you do the cheap > operation first rather than doing the complex operation on the wider type. > > That is, > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > is cheaper than > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > after vectorization. Normally the vectorizer will try to do this through over- > widening detection as well, > but we haven't taught ranger about the ranges of these new IFNs (probably > should at some point). > > Cheers, > Tamar > > > The below tests are passed for this patch. > > 1. The rv64gcv fully regression tests. > > 2. The rv64gcv build with glibc. > > 3. The x86 bootstrap tests. > > 4. The x86 fully regression tests. > > > > gcc/ChangeLog: > > > > * match.pd: Add convert description for minus and capture. > > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > > new logic to handle in_type is incompatibile with out_type, as > > well as rename from. > > (vect_recog_build_binary_gimple_stmt): Rename to. > > (vect_recog_sat_add_pattern): Leverage above renamed func. > > (vect_recog_sat_sub_pattern): Ditto. 
> > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > --- > > gcc/match.pd | 4 +-- > > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > > 2 files changed, 33 insertions(+), 22 deletions(-) > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > index 3d0689c9312..4a4b0b2e72f 100644 > > --- a/gcc/match.pd > > +++ b/gcc/match.pd > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > /* Unsigned saturation sub, case 2 (branch with ge): > > SAT_U_SUB = X >= Y ? X - Y : 0. */ > > (match (unsigned_integer_sat_sub @0 @1) > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > > integer_zerop) > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > > - && types_match (type, @0, @1)))) > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > > > /* Unsigned saturation sub, case 3 (branchless with gt): > > SAT_U_SUB = (X - Y) * (X > Y). */ > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > index cef901808eb..3d887d36050 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > > > -static gcall * > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > > +static gimple * > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info > stmt_info, > > internal_fn fn, tree *type_out, > > - tree op_0, tree op_1) > > + tree lhs, tree op_0, tree op_1) > > { > > tree itype = TREE_TYPE (op_0); > > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > > + tree otype = TREE_TYPE (lhs); > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > > > - if (vtype != NULL_TREE > > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > > { > > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > > + gimple_call_set_lhs (call, in_ssa); > > gimple_call_set_nothrow (call, /* nothrow_p */ false); > > - gimple_set_location (call, gimple_location (stmt)); > > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT > (stmt_info))); > > + > > + *type_out = v_otype; > > > > - *type_out = vtype; > > + if (types_compatible_p (itype, otype)) > > + return call; > > + else > > + { > > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > > > - return call; > > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > > + } > > } > > > > return NULL; > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > > stmt_vec_info stmt_vinfo, > > > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > > { > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > - IFN_SAT_ADD, type_out, > > - ops[0], ops[1]); > > - if (call) > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > + IFN_SAT_ADD, type_out, > > + lhs, ops[0], ops[1]); > > + if (stmt) > > { > > vect_pattern_detected 
("vect_recog_sat_add_pattern", last_stmt); > > - return call; > > + return stmt; > > } > > } > > > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > > stmt_vec_info stmt_vinfo, > > > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > > { > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > - IFN_SAT_SUB, type_out, > > - ops[0], ops[1]); > > - if (call) > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > + IFN_SAT_SUB, type_out, > > + lhs, ops[0], ops[1]); > > + if (stmt) > > { > > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > > - return call; > > + return stmt; > > } > > } > > > > -- > > 2.34.1
> -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Tuesday, June 25, 2024 7:06 AM > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should > have > > though it through more. > > Never mind, but you enlighten me for even more optimize with some restrictions. I > revisited the pattern, for example as below. > > uint16_t a, b; > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > => result = (char unsigned).SAT_SUB (a, b) > > If a has a def like below > uint8_t other = 0x1f; > a = (uint8_t)other You can in principle do this by querying range information, e.g. gimple_ranger ranger; int_range_max r; if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ()) { ... We do this for instance in vect_recog_divmod_pattern. Tamar > > then we can safely convert result = (char unsigned).SAT_SUB (a, b) to > result = .SAT_SUB ((char unsigned)a, (char unsigned).b) > > Then we may have better vectorized code if a is limited to char unsigned. Of course > we can do that based on this patch. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, June 25, 2024 12:01 PM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > -----Original Message----- > > From: Li, Pan2 <pan2.li@intel.com> > > Sent: Tuesday, June 25, 2024 3:25 AM > > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Thanks Tamar for comments. It indeed benefits the vectorized code, for example > in > > RISC-V, we may eliminate some vsetvel insn in loop for widen here. > > > > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) > b_12(D)); > > > is cheaper than > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > I am not sure if it has any correctness problem for this transform, take uint16_t > to > > uint8_t as example. > > > > uint16_t a, b; > > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > > > Given a = 0x100; // 256 > > b = 0xff; // 255 > > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, > > 255) = 0 > > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, > > 255) = 1 > > > > Please help to correct me if any misunderstanding, thanks again for enlightening. > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should > have > though it through more. > > Tamar. 
> > > > Pan > > > > -----Original Message----- > > From: Tamar Christina <Tamar.Christina@arm.com> > > Sent: Tuesday, June 25, 2024 4:00 AM > > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Hi, > > > > > -----Original Message----- > > > From: pan2.li@intel.com <pan2.li@intel.com> > > > Sent: Monday, June 24, 2024 2:55 PM > > > To: gcc-patches@gcc.gnu.org > > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > richard.guenther@gmail.com; > > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > > > From: Pan Li <pan2.li@intel.com> > > > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > > truncated as below: > > > > > > void test (uint16_t *x, unsigned b, unsigned n) > > > { > > > unsigned a = 0; > > > register uint16_t *p = x; > > > > > > do { > > > a = *--p; > > > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > > > } while (--n); > > > } > > > > > > It will have gimple before vect pass, it cannot hit any pattern of > > > SAT_SUB and then cannot vectorize to SAT_SUB. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > > > > > This patch would like to improve the pattern match to recog above > > > as truncate after .SAT_SUB pattern. Then we will have the pattern > > > similar to below, as well as eliminate the first 3 dead stmt. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > > > > I guess this is because one branch of the cond is a constant so the > > convert is folded in. I was wondering though, can't we just push > > in the truncate in this case? > > > > i.e. in this case we know both types are unsigned and the difference > > positive and max value is the max value of the truncate type. > > > > It seems like folding as a general rule > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Into > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); > > iftmp.0_13 = _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Is valid (though might have missed something). This would negate the need for > > this change to the vectorizer and saturation detection > > but also should generate better vector code. This is what we do in the general > case > > https://godbolt.org/z/dfoj6fWdv > > I think here we're just not seeing through the cond. > > > > Typically lots of architectures have cheap truncation operations, so truncating > > before saturation means you do the cheap > > operation first rather than doing the complex operation on the wider type. 
> > > > That is, > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > > > is cheaper than > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > after vectorization. Normally the vectorizer will try to do this through over- > > widening detection as well, > > but we haven't taught ranger about the ranges of these new IFNs (probably > > should at some point). > > > > Cheers, > > Tamar > > > > > The below tests are passed for this patch. > > > 1. The rv64gcv fully regression tests. > > > 2. The rv64gcv build with glibc. > > > 3. The x86 bootstrap tests. > > > 4. The x86 fully regression tests. > > > > > > gcc/ChangeLog: > > > > > > * match.pd: Add convert description for minus and capture. > > > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > > > new logic to handle in_type is incompatibile with out_type, as > > > well as rename from. > > > (vect_recog_build_binary_gimple_stmt): Rename to. > > > (vect_recog_sat_add_pattern): Leverage above renamed func. > > > (vect_recog_sat_sub_pattern): Ditto. > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > > --- > > > gcc/match.pd | 4 +-- > > > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > > > 2 files changed, 33 insertions(+), 22 deletions(-) > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 3d0689c9312..4a4b0b2e72f 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > /* Unsigned saturation sub, case 2 (branch with ge): > > > SAT_U_SUB = X >= Y ? X - Y : 0. */ > > > (match (unsigned_integer_sat_sub @0 @1) > > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > > > integer_zerop) > > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > > > - && types_match (type, @0, @1)))) > > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > > > > > /* Unsigned saturation sub, case 3 (branchless with gt): > > > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > > index cef901808eb..3d887d36050 100644 > > > --- a/gcc/tree-vect-patterns.cc > > > +++ b/gcc/tree-vect-patterns.cc > > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > > > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > > > > > -static gcall * > > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > > > +static gimple * > > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info > > stmt_info, > > > internal_fn fn, tree *type_out, > > > - tree op_0, tree op_1) > > > + tree lhs, tree op_0, tree op_1) > > > { > > > tree itype = TREE_TYPE (op_0); > > > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree otype = TREE_TYPE (lhs); > > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > > > > > - if (vtype != NULL_TREE > > > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > > > { > > > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > > > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > > > > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > > > + gimple_call_set_lhs (call, in_ssa); > > > gimple_call_set_nothrow (call, /* nothrow_p */ false); > > > - gimple_set_location (call, gimple_location (stmt)); > > > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT > > (stmt_info))); > > > + > > > + *type_out = v_otype; > > > > > > - *type_out = vtype; > > > + if (types_compatible_p (itype, otype)) > > > + return call; > > > + else > > > + { > > > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > > > > > - return call; > > > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > > > + } > > > } > > > > > > return NULL; > > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_ADD, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_ADD, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_SUB, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_SUB, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > -- > > > 2.34.1
Got it, thanks Tamer, will have a try. Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Tuesday, June 25, 2024 2:11 PM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Tuesday, June 25, 2024 7:06 AM > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should > have > > though it through more. > > Never mind, but you enlighten me for even more optimize with some restrictions. I > revisited the pattern, for example as below. > > uint16_t a, b; > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > => result = (char unsigned).SAT_SUB (a, b) > > If a has a def like below > uint8_t other = 0x1f; > a = (uint8_t)other You can in principle do this by querying range information, e.g. gimple_ranger ranger; int_range_max r; if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ()) { ... We do this for instance in vect_recog_divmod_pattern. Tamar > > then we can safely convert result = (char unsigned).SAT_SUB (a, b) to > result = .SAT_SUB ((char unsigned)a, (char unsigned).b) > > Then we may have better vectorized code if a is limited to char unsigned. Of course > we can do that based on this patch. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, June 25, 2024 12:01 PM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > -----Original Message----- > > From: Li, Pan2 <pan2.li@intel.com> > > Sent: Tuesday, June 25, 2024 3:25 AM > > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Thanks Tamar for comments. It indeed benefits the vectorized code, for example > in > > RISC-V, we may eliminate some vsetvel insn in loop for widen here. > > > > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) > b_12(D)); > > > is cheaper than > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > I am not sure if it has any correctness problem for this transform, take uint16_t > to > > uint8_t as example. > > > > uint16_t a, b; > > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > > > Given a = 0x100; // 256 > > b = 0xff; // 255 > > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, > > 255) = 0 > > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, > > 255) = 1 > > > > Please help to correct me if any misunderstanding, thanks again for enlightening. > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should > have > though it through more. > > Tamar. 
> > > > Pan > > > > -----Original Message----- > > From: Tamar Christina <Tamar.Christina@arm.com> > > Sent: Tuesday, June 25, 2024 4:00 AM > > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Hi, > > > > > -----Original Message----- > > > From: pan2.li@intel.com <pan2.li@intel.com> > > > Sent: Monday, June 24, 2024 2:55 PM > > > To: gcc-patches@gcc.gnu.org > > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > richard.guenther@gmail.com; > > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > > > From: Pan Li <pan2.li@intel.com> > > > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > > truncated as below: > > > > > > void test (uint16_t *x, unsigned b, unsigned n) > > > { > > > unsigned a = 0; > > > register uint16_t *p = x; > > > > > > do { > > > a = *--p; > > > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > > > } while (--n); > > > } > > > > > > It will have gimple before vect pass, it cannot hit any pattern of > > > SAT_SUB and then cannot vectorize to SAT_SUB. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > > > > > This patch would like to improve the pattern match to recog above > > > as truncate after .SAT_SUB pattern. Then we will have the pattern > > > similar to below, as well as eliminate the first 3 dead stmt. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > > > > I guess this is because one branch of the cond is a constant so the > > convert is folded in. I was wondering though, can't we just push > > in the truncate in this case? > > > > i.e. in this case we know both types are unsigned and the difference > > positive and max value is the max value of the truncate type. > > > > It seems like folding as a general rule > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Into > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); > > iftmp.0_13 = _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Is valid (though might have missed something). This would negate the need for > > this change to the vectorizer and saturation detection > > but also should generate better vector code. This is what we do in the general > case > > https://godbolt.org/z/dfoj6fWdv > > I think here we're just not seeing through the cond. > > > > Typically lots of architectures have cheap truncation operations, so truncating > > before saturation means you do the cheap > > operation first rather than doing the complex operation on the wider type. 
> > > > That is, > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > > > is cheaper than > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > after vectorization. Normally the vectorizer will try to do this through over- > > widening detection as well, > > but we haven't taught ranger about the ranges of these new IFNs (probably > > should at some point). > > > > Cheers, > > Tamar > > > > > The below tests are passed for this patch. > > > 1. The rv64gcv fully regression tests. > > > 2. The rv64gcv build with glibc. > > > 3. The x86 bootstrap tests. > > > 4. The x86 fully regression tests. > > > > > > gcc/ChangeLog: > > > > > > * match.pd: Add convert description for minus and capture. > > > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > > > new logic to handle in_type is incompatibile with out_type, as > > > well as rename from. > > > (vect_recog_build_binary_gimple_stmt): Rename to. > > > (vect_recog_sat_add_pattern): Leverage above renamed func. > > > (vect_recog_sat_sub_pattern): Ditto. > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > > --- > > > gcc/match.pd | 4 +-- > > > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > > > 2 files changed, 33 insertions(+), 22 deletions(-) > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 3d0689c9312..4a4b0b2e72f 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > /* Unsigned saturation sub, case 2 (branch with ge): > > > SAT_U_SUB = X >= Y ? X - Y : 0. */ > > > (match (unsigned_integer_sat_sub @0 @1) > > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > > > integer_zerop) > > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > > > - && types_match (type, @0, @1)))) > > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > > > > > /* Unsigned saturation sub, case 3 (branchless with gt): > > > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > > index cef901808eb..3d887d36050 100644 > > > --- a/gcc/tree-vect-patterns.cc > > > +++ b/gcc/tree-vect-patterns.cc > > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > > > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > > > > > -static gcall * > > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > > > +static gimple * > > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info > > stmt_info, > > > internal_fn fn, tree *type_out, > > > - tree op_0, tree op_1) > > > + tree lhs, tree op_0, tree op_1) > > > { > > > tree itype = TREE_TYPE (op_0); > > > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree otype = TREE_TYPE (lhs); > > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > > > > > - if (vtype != NULL_TREE > > > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > > > { > > > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > > > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > > > > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > > > + gimple_call_set_lhs (call, in_ssa); > > > gimple_call_set_nothrow (call, /* nothrow_p */ false); > > > - gimple_set_location (call, gimple_location (stmt)); > > > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT > > (stmt_info))); > > > + > > > + *type_out = v_otype; > > > > > > - *type_out = vtype; > > > + if (types_compatible_p (itype, otype)) > > > + return call; > > > + else > > > + { > > > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > > > > > - return call; > > > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > > > + } > > > } > > > > > > return NULL; > > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_ADD, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_ADD, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_SUB, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_SUB, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > -- > > > 2.34.1
Thanks Tamar, gimple_ranger works well for that case, will send another patch after this one. Pan -----Original Message----- From: Li, Pan2 Sent: Tuesday, June 25, 2024 2:26 PM To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip Got it, thanks Tamar, will have a try. Pan -----Original Message----- From: Tamar Christina <Tamar.Christina@arm.com> Sent: Tuesday, June 25, 2024 2:11 PM To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > -----Original Message----- > From: Li, Pan2 <pan2.li@intel.com> > Sent: Tuesday, June 25, 2024 7:06 AM > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry, should > have > > thought it through more. > > Never mind, but you enlightened me about even more optimization with some restrictions. I > revisited the pattern, for example as below. > > uint16_t a, b; > uint8_t result = (uint8_t)(a >= b ? a - b : 0); > > => result = (char unsigned).SAT_SUB (a, b) > > If a has a def like below > uint8_t other = 0x1f; > a = (uint8_t)other You can in principle do this by querying range information, e.g. gimple_ranger ranger; int_range_max r; if (ranger.range_of_expr (r, oprnd0, stmt) && !r.undefined_p ()) { ... We do this for instance in vect_recog_divmod_pattern. Tamar > > then we can safely convert result = (char unsigned).SAT_SUB (a, b) to > result = .SAT_SUB ((char unsigned)a, (char unsigned)b) > > Then we may have better vectorized code if a is limited to char unsigned. Of course > we can do that based on this patch. > > Pan > > -----Original Message----- > From: Tamar Christina <Tamar.Christina@arm.com> > Sent: Tuesday, June 25, 2024 12:01 PM > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > -----Original Message----- > > From: Li, Pan2 <pan2.li@intel.com> > > Sent: Tuesday, June 25, 2024 3:25 AM > > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Thanks Tamar for comments. It indeed benefits the vectorized code, for example > in > > RISC-V, we may eliminate some vsetvl insns in the loop for widening here. > > > > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) > b_12(D)); > > > is cheaper than > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > I am not sure if it has any correctness problem for this transform, take uint16_t > to > > uint8_t as an example. > > > > uint16_t a, b; > > uint8_t result = (uint8_t)(a >= b ? 
a - b : 0); > > > > Given a = 0x100; // 256 > > b = 0xff; // 255 > > For iftmp.0_5 = .SAT_SUB ((char unsigned) a, (char unsigned) b) = .SAT_SUB (0, > > 255) = 0 > > For iftmp.0_5 = (char unsigned).SAT_SUB (a, b) = (char unsigned).SAT_SUB (256, > > 255) = 1 > > > > Please help to correct me if any misunderstanding, thanks again for enlightening. > > Ah, no you're right, those would end up wrong for saturation. Arg.. Sorry should > have > though it through more. > > Tamar. > > > > Pan > > > > -----Original Message----- > > From: Tamar Christina <Tamar.Christina@arm.com> > > Sent: Tuesday, June 25, 2024 4:00 AM > > To: Li, Pan2 <pan2.li@intel.com>; gcc-patches@gcc.gnu.org > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; > > jeffreyalaw@gmail.com; pinskia@gmail.com > > Subject: RE: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > Hi, > > > > > -----Original Message----- > > > From: pan2.li@intel.com <pan2.li@intel.com> > > > Sent: Monday, June 24, 2024 2:55 PM > > > To: gcc-patches@gcc.gnu.org > > > Cc: juzhe.zhong@rivai.ai; kito.cheng@gmail.com; > richard.guenther@gmail.com; > > > jeffreyalaw@gmail.com; pinskia@gmail.com; Pan Li <pan2.li@intel.com> > > > Subject: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > > > > > From: Pan Li <pan2.li@intel.com> > > > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > > truncated as below: > > > > > > void test (uint16_t *x, unsigned b, unsigned n) > > > { > > > unsigned a = 0; > > > register uint16_t *p = x; > > > > > > do { > > > a = *--p; > > > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > > > } while (--n); > > > } > > > > > > It will have gimple before vect pass, it cannot hit any pattern of > > > SAT_SUB and then cannot vectorize to SAT_SUB. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > > > > > This patch would like to improve the pattern match to recog above > > > as truncate after .SAT_SUB pattern. Then we will have the pattern > > > similar to below, as well as eliminate the first 3 dead stmt. > > > > > > _2 = a_11 - b_12(D); > > > iftmp.0_13 = (short unsigned int) _2; > > > _18 = a_11 >= b_12(D); > > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > > > > I guess this is because one branch of the cond is a constant so the > > convert is folded in. I was wondering though, can't we just push > > in the truncate in this case? > > > > i.e. in this case we know both types are unsigned and the difference > > positive and max value is the max value of the truncate type. > > > > It seems like folding as a general rule > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Into > > > > _1 = *p_10; > > a_11 = (unsigned int) _1; > > _2 = ((short unsigned int) a_11) - ((short unsigned int) b_12(D)); > > iftmp.0_13 = _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > *p_10 = iftmp.0_5; > > > > Is valid (though might have missed something). This would negate the need for > > this change to the vectorizer and saturation detection > > but also should generate better vector code. This is what we do in the general > case > > https://godbolt.org/z/dfoj6fWdv > > I think here we're just not seeing through the cond. 
> > > > Typically lots of architectures have cheap truncation operations, so truncating > > before saturation means you do the cheap > > operation first rather than doing the complex operation on the wider type. > > > > That is, > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = .SAT_SUB ((short unsigned int) a_11, (short unsigned int) b_12(D)); > > > > is cheaper than > > > > _2 = a_11 - b_12(D); > > iftmp.0_13 = (short unsigned int) _2; > > _18 = a_11 >= b_12(D); > > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > > > after vectorization. Normally the vectorizer will try to do this through over- > > widening detection as well, > > but we haven't taught ranger about the ranges of these new IFNs (probably > > should at some point). > > > > Cheers, > > Tamar > > > > > The below tests are passed for this patch. > > > 1. The rv64gcv fully regression tests. > > > 2. The rv64gcv build with glibc. > > > 3. The x86 bootstrap tests. > > > 4. The x86 fully regression tests. > > > > > > gcc/ChangeLog: > > > > > > * match.pd: Add convert description for minus and capture. > > > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > > > new logic to handle in_type is incompatibile with out_type, as > > > well as rename from. > > > (vect_recog_build_binary_gimple_stmt): Rename to. > > > (vect_recog_sat_add_pattern): Leverage above renamed func. > > > (vect_recog_sat_sub_pattern): Ditto. > > > > > > Signed-off-by: Pan Li <pan2.li@intel.com> > > > --- > > > gcc/match.pd | 4 +-- > > > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > > > 2 files changed, 33 insertions(+), 22 deletions(-) > > > > > > diff --git a/gcc/match.pd b/gcc/match.pd > > > index 3d0689c9312..4a4b0b2e72f 100644 > > > --- a/gcc/match.pd > > > +++ b/gcc/match.pd > > > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > > > /* Unsigned saturation sub, case 2 (branch with ge): > > > SAT_U_SUB = X >= Y ? X - Y : 0. */ > > > (match (unsigned_integer_sat_sub @0 @1) > > > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > > > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) > > > integer_zerop) > > > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > > > - && types_match (type, @0, @1)))) > > > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > > > > > /* Unsigned saturation sub, case 3 (branchless with gt): > > > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > > > index cef901808eb..3d887d36050 100644 > > > --- a/gcc/tree-vect-patterns.cc > > > +++ b/gcc/tree-vect-patterns.cc > > > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > > > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > > > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > > > > > -static gcall * > > > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > > > +static gimple * > > > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info > > stmt_info, > > > internal_fn fn, tree *type_out, > > > - tree op_0, tree op_1) > > > + tree lhs, tree op_0, tree op_1) > > > { > > > tree itype = TREE_TYPE (op_0); > > > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree otype = TREE_TYPE (lhs); > > > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > > > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > > > > > - if (vtype != NULL_TREE > > > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > > > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > > > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > > > { > > > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > > > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > > > > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > > > + gimple_call_set_lhs (call, in_ssa); > > > gimple_call_set_nothrow (call, /* nothrow_p */ false); > > > - gimple_set_location (call, gimple_location (stmt)); > > > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT > > (stmt_info))); > > > + > > > + *type_out = v_otype; > > > > > > - *type_out = vtype; > > > + if (types_compatible_p (itype, otype)) > > > + return call; > > > + else > > > + { > > > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > > > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > > > > > - return call; > > > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > > > + } > > > } > > > > > > return NULL; > > > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_ADD, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_ADD, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, > > > stmt_vec_info stmt_vinfo, > > > > > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > > > { > > > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > > > - IFN_SAT_SUB, type_out, > > > - ops[0], ops[1]); > > > - if (call) > > > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > > > + IFN_SAT_SUB, type_out, > > > + lhs, ops[0], ops[1]); > > > + if (stmt) > > > { > > > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > > > - return call; > > > + return stmt; > > > } > > > } > > > > > > -- > > > 2.34.1
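The pitfall Pan points out above (a = 0x100, b = 0xff) is easy to check with a small standalone program. This is purely an illustration of the counterexample, not part of the patch; the helper names are invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Scalar models of the two orderings discussed above.  */
static uint16_t sat_sub_u16 (uint16_t x, uint16_t y) { return x >= y ? x - y : 0; }
static uint8_t  sat_sub_u8  (uint8_t x, uint8_t y)   { return x >= y ? x - y : 0; }

int
main (void)
{
  uint16_t a = 0x100, b = 0xff;
  /* Saturate in the wide type, then truncate (what the source computes).  */
  uint8_t wide_then_trunc = (uint8_t) sat_sub_u16 (a, b);          /* 1 */
  /* Truncate the operands first, then saturate (the invalid rewrite).  */
  uint8_t trunc_then_sat = sat_sub_u8 ((uint8_t) a, (uint8_t) b);  /* 0 */
  printf ("%u %u\n", (unsigned) wide_then_trunc, (unsigned) trunc_then_sat);
  return 0;
}

Without extra knowledge about the operands' ranges, the truncation therefore has to stay on the result, which is the shape this patch teaches the vectorizer to recognize.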
On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > truncated as below: > > void test (uint16_t *x, unsigned b, unsigned n) > { > unsigned a = 0; > register uint16_t *p = x; > > do { > a = *--p; > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > } while (--n); > } > > It will have gimple before vect pass, it cannot hit any pattern of > SAT_SUB and then cannot vectorize to SAT_SUB. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > This patch would like to improve the pattern match to recog above > as truncate after .SAT_SUB pattern. Then we will have the pattern > similar to below, as well as eliminate the first 3 dead stmt. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > The below tests are passed for this patch. > 1. The rv64gcv fully regression tests. > 2. The rv64gcv build with glibc. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > gcc/ChangeLog: > > * match.pd: Add convert description for minus and capture. > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > new logic to handle in_type is incompatibile with out_type, as > well as rename from. > (vect_recog_build_binary_gimple_stmt): Rename to. > (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) I suppose the other patterns can see similar enhacements for the case their forms show up truncated or extended? > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); Please use NOP_EXPR here. > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1 >
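For concreteness, Richard's CONVERT_EXPR/NOP_EXPR remark above only affects how the final conversion statement is built; the else branch would then read roughly as below (NOP_EXPR and CONVERT_EXPR are handled alike via CASE_CONVERT, NOP_EXPR just being the conventional spelling on gimple). This is a sketch of the requested change, not a quote from a later revision of the patch.

      else
	{
	  append_pattern_def_seq (vinfo, stmt_info, call, v_itype);
	  tree out_ssa = vect_recog_temp_ssa_var (otype, NULL);

	  return gimple_build_assign (out_ssa, NOP_EXPR, in_ssa);
	}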
> I suppose the other patterns can see similar enhacements for the case > their forms > show up truncated or extended? Yes, just want to highlight that this form comes from the zip benchmark. Of course, the rest forms are planed in underlying Patch(es). > Please use NOP_EXPR here. Sure, and will send the v2 if no surprise from test. Pan -----Original Message----- From: Richard Biener <richard.guenther@gmail.com> Sent: Wednesday, June 26, 2024 9:56 PM To: Li, Pan2 <pan2.li@intel.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > truncated as below: > > void test (uint16_t *x, unsigned b, unsigned n) > { > unsigned a = 0; > register uint16_t *p = x; > > do { > a = *--p; > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > } while (--n); > } > > It will have gimple before vect pass, it cannot hit any pattern of > SAT_SUB and then cannot vectorize to SAT_SUB. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > This patch would like to improve the pattern match to recog above > as truncate after .SAT_SUB pattern. Then we will have the pattern > similar to below, as well as eliminate the first 3 dead stmt. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > The below tests are passed for this patch. > 1. The rv64gcv fully regression tests. > 2. The rv64gcv build with glibc. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > gcc/ChangeLog: > > * match.pd: Add convert description for minus and capture. > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > new logic to handle in_type is incompatibile with out_type, as > well as rename from. > (vect_recog_build_binary_gimple_stmt): Rename to. > (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) I suppose the other patterns can see similar enhacements for the case their forms show up truncated or extended? > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); Please use NOP_EXPR here. > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1 >
On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > truncated as below: > > void test (uint16_t *x, unsigned b, unsigned n) > { > unsigned a = 0; > register uint16_t *p = x; > > do { > a = *--p; > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > } while (--n); > } > > It will have gimple before vect pass, it cannot hit any pattern of > SAT_SUB and then cannot vectorize to SAT_SUB. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > This patch would like to improve the pattern match to recog above > as truncate after .SAT_SUB pattern. Then we will have the pattern > similar to below, as well as eliminate the first 3 dead stmt. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > The below tests are passed for this patch. > 1. The rv64gcv fully regression tests. > 2. The rv64gcv build with glibc. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. I have tried this patch with x86_64 on the testcase from PR51492, but the compiler does not recognize the .SAT_SUB pattern here. Is there anything else missing for successful detection? Uros. > > gcc/ChangeLog: > > * match.pd: Add convert description for minus and capture. > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > new logic to handle in_type is incompatibile with out_type, as > well as rename from. > (vect_recog_build_binary_gimple_stmt): Rename to. > (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). 
*/ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1 >
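The PR testcase itself is not quoted in the thread; purely for illustration (this is not claimed to be the PR's code), the kind of 16-bit saturating-subtract loop at issue looks like:

#include <stdint.h>

void
sat_sub_u16_loop (uint16_t *x, uint16_t y, int n)
{
  for (int i = 0; i < n; i++)
    x[i] = x[i] >= y ? x[i] - y : 0;
}

Here the uint16_t operands are promoted before the subtraction, so the gimple again contains a wider subtraction whose result is truncated back to uint16_t for the store.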
It only requires the backend implement the standard name for vector mode I bet. How about a simpler one like below. #define DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(OUT_T, IN_T) \ void __attribute__((noinline)) \ vec_sat_u_sub_trunc_##OUT_T##_fmt_1 (OUT_T *out, IN_T *op_1, IN_T y, \ unsigned limit) \ { \ unsigned i; \ for (i = 0; i < limit; i++) \ { \ IN_T x = op_1[i]; \ out[i] = (OUT_T)(x >= y ? x - y : 0); \ } \ } DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint32_t, uint64_t); The riscv backend is able to detect the pattern similar as below. I can help to check x86 side after the running test suites. ;; basic block 2, loop depth 0 ;; pred: ENTRY if (limit_11(D) != 0) goto <bb 3>; [89.00%] else goto <bb 5>; [11.00%] ;; succ: 3 ;; 5 ;; basic block 3, loop depth 0 ;; pred: 2 vect_cst__71 = [vec_duplicate_expr] y_14(D); _78 = (unsigned long) limit_11(D); ;; succ: 4 ;; basic block 4, loop depth 1 ;; pred: 4 ;; 3 # vectp_op_1.7_68 = PHI <vectp_op_1.7_69(4), op_1_12(D)(3)> # vectp_out.12_75 = PHI <vectp_out.12_76(4), out_16(D)(3)> # ivtmp_79 = PHI <ivtmp_80(4), _78(3)> _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [2, 2]); ivtmp_67 = _81 * 8; vect_x_13.9_70 = .MASK_LEN_LOAD (vectp_op_1.7_68, 64B, { -1, ... }, _81, 0); vect_patt_48.10_72 = .SAT_SUB (vect_x_13.9_70, vect_cst__71); // .SAT_SUB pattern vect_patt_49.11_73 = (vector([2,2]) unsigned int) vect_patt_48.10_72; ivtmp_74 = _81 * 4; .MASK_LEN_STORE (vectp_out.12_75, 32B, { -1, ... }, _81, 0, vect_patt_49.11_73); vectp_op_1.7_69 = vectp_op_1.7_68 + ivtmp_67; vectp_out.12_76 = vectp_out.12_75 + ivtmp_74; ivtmp_80 = ivtmp_79 - _81; riscv64-unknown-elf-gcc (GCC) 15.0.0 20240627 (experimental) Copyright (C) 2024 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. Pan -----Original Message----- From: Uros Bizjak <ubizjak@gmail.com> Sent: Thursday, June 27, 2024 2:48 PM To: Li, Pan2 <pan2.li@intel.com> Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote: > > From: Pan Li <pan2.li@intel.com> > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > truncated as below: > > void test (uint16_t *x, unsigned b, unsigned n) > { > unsigned a = 0; > register uint16_t *p = x; > > do { > a = *--p; > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > } while (--n); > } > > It will have gimple before vect pass, it cannot hit any pattern of > SAT_SUB and then cannot vectorize to SAT_SUB. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = _18 ? iftmp.0_13 : 0; > > This patch would like to improve the pattern match to recog above > as truncate after .SAT_SUB pattern. Then we will have the pattern > similar to below, as well as eliminate the first 3 dead stmt. > > _2 = a_11 - b_12(D); > iftmp.0_13 = (short unsigned int) _2; > _18 = a_11 >= b_12(D); > iftmp.0_5 = (short unsigned int).SAT_SUB (a_11, b_12(D)); > > The below tests are passed for this patch. > 1. The rv64gcv fully regression tests. > 2. The rv64gcv build with glibc. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. I have tried this patch with x86_64 on the testcase from PR51492, but the compiler does not recognize the .SAT_SUB pattern here. 
Is there anything else missing for successful detection? Uros. > > gcc/ChangeLog: > > * match.pd: Add convert description for minus and capture. > * tree-vect-patterns.cc (vect_recog_build_binary_gimple_call): Add > new logic to handle in_type is incompatibile with out_type, as > well as rename from. > (vect_recog_build_binary_gimple_stmt): Rename to. > (vect_recog_sat_add_pattern): Leverage above renamed func. > (vect_recog_sat_sub_pattern): Ditto. > > Signed-off-by: Pan Li <pan2.li@intel.com> > --- > gcc/match.pd | 4 +-- > gcc/tree-vect-patterns.cc | 51 ++++++++++++++++++++++++--------------- > 2 files changed, 33 insertions(+), 22 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 3d0689c9312..4a4b0b2e72f 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned saturation sub, case 2 (branch with ge): > SAT_U_SUB = X >= Y ? X - Y : 0. */ > (match (unsigned_integer_sat_sub @0 @1) > - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) > + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop) > (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) > - && types_match (type, @0, @1)))) > + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) > > /* Unsigned saturation sub, case 3 (branchless with gt): > SAT_U_SUB = (X - Y) * (X > Y). */ > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index cef901808eb..3d887d36050 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, > extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); > > -static gcall * > -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, > +static gimple * > +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, > internal_fn fn, tree *type_out, > - tree op_0, tree op_1) > + tree lhs, tree op_0, tree op_1) > { > tree itype = TREE_TYPE (op_0); > - tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + tree otype = TREE_TYPE (lhs); > + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); > + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); > > - if (vtype != NULL_TREE > - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) > + if (v_itype != NULL_TREE && v_otype != NULL_TREE > + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) > { > gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); > + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); > > - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); > + gimple_call_set_lhs (call, in_ssa); > gimple_call_set_nothrow (call, /* nothrow_p */ false); > - gimple_set_location (call, gimple_location (stmt)); > + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); > + > + *type_out = v_otype; > > - *type_out = vtype; > + if (types_compatible_p (itype, otype)) > + return call; > + else > + { > + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); > + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); > > - return call; > + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); > + } > } > > return NULL; > @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, 
last_stmt, > - IFN_SAT_ADD, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_ADD, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); > - return call; > + return stmt; > } > } > > @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > > if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) > { > - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, > - IFN_SAT_SUB, type_out, > - ops[0], ops[1]); > - if (call) > + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, > + IFN_SAT_SUB, type_out, > + lhs, ops[0], ops[1]); > + if (stmt) > { > vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); > - return call; > + return stmt; > } > } > > -- > 2.34.1 >
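Hand-expanding the instantiation used above, DEF_VEC_SAT_U_SUB_TRUNC_FMT_1 (uint32_t, uint64_t), gives roughly the following test function (expanded here only for readability):

#include <stdint.h>

void __attribute__ ((noinline))
vec_sat_u_sub_trunc_uint32_t_fmt_1 (uint32_t *out, uint64_t *op_1, uint64_t y,
				    unsigned limit)
{
  unsigned i;
  for (i = 0; i < limit; i++)
    {
      uint64_t x = op_1[i];
      out[i] = (uint32_t)(x >= y ? x - y : 0);
    }
}

The 64-bit subtract saturates to zero and only the result is truncated to 32 bits, which is exactly the shape the new pattern matches and what the .SAT_SUB plus conversion in the dump above corresponds to.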
On Thu, Jun 27, 2024 at 9:01 AM Li, Pan2 <pan2.li@intel.com> wrote: > > It only requires the backend implement the standard name for vector mode I bet. There are several standard names present for x86: {ss,us}{add,sub}{v8qi,v16qi,v32qi,v64qi,v4hi,v8hi,v16hi,v32hi}, defined in sse.md: (define_expand "<insn><mode>3<mask_name>" [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand") (sat_plusminus:VI12_AVX2_AVX512BW (match_operand:VI12_AVX2_AVX512BW 1 "vector_operand") (match_operand:VI12_AVX2_AVX512BW 2 "vector_operand")))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") but all of these handle only 8 and 16 bit elements. > How about a simpler one like below. > > #define DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(OUT_T, IN_T) \ > void __attribute__((noinline)) \ > vec_sat_u_sub_trunc_##OUT_T##_fmt_1 (OUT_T *out, IN_T *op_1, IN_T y, \ > unsigned limit) \ > { \ > unsigned i; \ > for (i = 0; i < limit; i++) \ > { \ > IN_T x = op_1[i]; \ > out[i] = (OUT_T)(x >= y ? x - y : 0); \ > } \ > } > > DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint32_t, uint64_t); I tried with: DEF_VEC_SAT_U_SUB_TRUNC_FMT_1(uint8_t, uint16_t); And the compiler was able to detect several .SAT_SUB patterns: $ grep SAT_SUB pr51492-1.c.266t.optimized vect_patt_37.14_85 = .SAT_SUB (vect_x_13.12_81, vect_cst__84); vect_patt_37.14_86 = .SAT_SUB (vect_x_13.13_83, vect_cst__84); vect_patt_42.26_126 = .SAT_SUB (vect_x_62.24_122, vect_cst__125); vect_patt_42.26_127 = .SAT_SUB (vect_x_62.25_124, vect_cst__125); iftmp.0_24 = .SAT_SUB (x_3, y_14(D)); Uros. > > The riscv backend is able to detect the pattern similar as below. I can help to check x86 side after the running test suites. > > ;; basic block 2, loop depth 0 > ;; pred: ENTRY > if (limit_11(D) != 0) > goto <bb 3>; [89.00%] > else > goto <bb 5>; [11.00%] > ;; succ: 3 > ;; 5 > ;; basic block 3, loop depth 0 > ;; pred: 2 > vect_cst__71 = [vec_duplicate_expr] y_14(D); > _78 = (unsigned long) limit_11(D); > ;; succ: 4 > > ;; basic block 4, loop depth 1 > ;; pred: 4 > ;; 3 > # vectp_op_1.7_68 = PHI <vectp_op_1.7_69(4), op_1_12(D)(3)> > # vectp_out.12_75 = PHI <vectp_out.12_76(4), out_16(D)(3)> > # ivtmp_79 = PHI <ivtmp_80(4), _78(3)> > _81 = .SELECT_VL (ivtmp_79, POLY_INT_CST [2, 2]); > ivtmp_67 = _81 * 8; > vect_x_13.9_70 = .MASK_LEN_LOAD (vectp_op_1.7_68, 64B, { -1, ... }, _81, 0); > vect_patt_48.10_72 = .SAT_SUB (vect_x_13.9_70, vect_cst__71); // .SAT_SUB pattern > vect_patt_49.11_73 = (vector([2,2]) unsigned int) vect_patt_48.10_72; > ivtmp_74 = _81 * 4; > .MASK_LEN_STORE (vectp_out.12_75, 32B, { -1, ... }, _81, 0, vect_patt_49.11_73); > vectp_op_1.7_69 = vectp_op_1.7_68 + ivtmp_67; > vectp_out.12_76 = vectp_out.12_75 + ivtmp_74; > ivtmp_80 = ivtmp_79 - _81; > > riscv64-unknown-elf-gcc (GCC) 15.0.0 20240627 (experimental) > Copyright (C) 2024 Free Software Foundation, Inc. > This is free software; see the source for copying conditions. There is NO > warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
> > Pan > > -----Original Message----- > From: Uros Bizjak <ubizjak@gmail.com> > Sent: Thursday, June 27, 2024 2:48 PM > To: Li, Pan2 <pan2.li@intel.com> > Cc: gcc-patches@gcc.gnu.org; juzhe.zhong@rivai.ai; kito.cheng@gmail.com; richard.guenther@gmail.com; jeffreyalaw@gmail.com; pinskia@gmail.com > Subject: Re: [PATCH v2] Vect: Support truncate after .SAT_SUB pattern in zip > > On Mon, Jun 24, 2024 at 3:55 PM <pan2.li@intel.com> wrote: > > > > From: Pan Li <pan2.li@intel.com> > > > > The zip benchmark of coremark-pro have one SAT_SUB like pattern but > > truncated as below: > > > > void test (uint16_t *x, unsigned b, unsigned n) > > { > > unsigned a = 0; > > register uint16_t *p = x; > > > > do { > > a = *--p; > > *p = (uint16_t)(a >= b ? a - b : 0); // Truncate after .SAT_SUB > > } while (--n); > > } > > No, the current compiler does not recognize .SAT_SUB for x86 with the above code, although many vector sat sub instructions involving 16bit elements are present. Uros.
diff --git a/gcc/match.pd b/gcc/match.pd index 3d0689c9312..4a4b0b2e72f 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -3164,9 +3164,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* Unsigned saturation sub, case 2 (branch with ge): SAT_U_SUB = X >= Y ? X - Y : 0. */ (match (unsigned_integer_sat_sub @0 @1) - (cond^ (ge @0 @1) (minus @0 @1) integer_zerop) + (cond^ (ge @0 @1) (convert? (minus (convert1? @0) (convert1? @1))) integer_zerop) (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type) - && types_match (type, @0, @1)))) + && TYPE_UNSIGNED (TREE_TYPE (@0)) && types_match (@0, @1)))) /* Unsigned saturation sub, case 3 (branchless with gt): SAT_U_SUB = (X - Y) * (X > Y). */ diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index cef901808eb..3d887d36050 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -4490,26 +4490,37 @@ vect_recog_mult_pattern (vec_info *vinfo, extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); extern bool gimple_unsigned_integer_sat_sub (tree, tree*, tree (*)(tree)); -static gcall * -vect_recog_build_binary_gimple_call (vec_info *vinfo, gimple *stmt, +static gimple * +vect_recog_build_binary_gimple_stmt (vec_info *vinfo, stmt_vec_info stmt_info, internal_fn fn, tree *type_out, - tree op_0, tree op_1) + tree lhs, tree op_0, tree op_1) { tree itype = TREE_TYPE (op_0); - tree vtype = get_vectype_for_scalar_type (vinfo, itype); + tree otype = TREE_TYPE (lhs); + tree v_itype = get_vectype_for_scalar_type (vinfo, itype); + tree v_otype = get_vectype_for_scalar_type (vinfo, otype); - if (vtype != NULL_TREE - && direct_internal_fn_supported_p (fn, vtype, OPTIMIZE_FOR_BOTH)) + if (v_itype != NULL_TREE && v_otype != NULL_TREE + && direct_internal_fn_supported_p (fn, v_itype, OPTIMIZE_FOR_BOTH)) { gcall *call = gimple_build_call_internal (fn, 2, op_0, op_1); + tree in_ssa = vect_recog_temp_ssa_var (itype, NULL); - gimple_call_set_lhs (call, vect_recog_temp_ssa_var (itype, NULL)); + gimple_call_set_lhs (call, in_ssa); gimple_call_set_nothrow (call, /* nothrow_p */ false); - gimple_set_location (call, gimple_location (stmt)); + gimple_set_location (call, gimple_location (STMT_VINFO_STMT (stmt_info))); + + *type_out = v_otype; - *type_out = vtype; + if (types_compatible_p (itype, otype)) + return call; + else + { + append_pattern_def_seq (vinfo, stmt_info, call, v_itype); + tree out_ssa = vect_recog_temp_ssa_var (otype, NULL); - return call; + return gimple_build_assign (out_ssa, CONVERT_EXPR, in_ssa); + } } return NULL; @@ -4541,13 +4552,13 @@ vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)) { - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, - IFN_SAT_ADD, type_out, - ops[0], ops[1]); - if (call) + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, + IFN_SAT_ADD, type_out, + lhs, ops[0], ops[1]); + if (stmt) { vect_pattern_detected ("vect_recog_sat_add_pattern", last_stmt); - return call; + return stmt; } } @@ -4579,13 +4590,13 @@ vect_recog_sat_sub_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, if (gimple_unsigned_integer_sat_sub (lhs, ops, NULL)) { - gcall *call = vect_recog_build_binary_gimple_call (vinfo, last_stmt, - IFN_SAT_SUB, type_out, - ops[0], ops[1]); - if (call) + gimple *stmt = vect_recog_build_binary_gimple_stmt (vinfo, stmt_vinfo, + IFN_SAT_SUB, type_out, + lhs, ops[0], ops[1]); + if (stmt) { vect_pattern_detected ("vect_recog_sat_sub_pattern", last_stmt); - return call; + return stmt; 
} }
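As a closing illustration of the gimple_ranger idea raised earlier in the thread (Pan's planned follow-up), one possible shape of the range check is sketched below. The function name and the exact test are assumptions of this sketch, not code from any posted patch; it assumes both operands and the narrow type are unsigned integers and that the narrow type is strictly smaller than the operand type.

/* Hypothetical helper: return true iff ranger can prove both operands of
   the wide .SAT_SUB already fit in the narrow type OTYPE.  In that case
   (OTYPE) .SAT_SUB (op_0, op_1) == .SAT_SUB ((OTYPE) op_0, (OTYPE) op_1),
   so the cheaper narrow saturating subtract can be used.  */
static bool
sat_sub_operands_fit_narrow_type_p (tree op_0, tree op_1, tree otype,
				    gimple *stmt)
{
  gimple_ranger ranger;
  int_range_max r0, r1;
  /* 2**precision(OTYPE) - 1, expressed in the wider precision of the
     operands so the wide_int comparisons below are well formed.  */
  wide_int narrow_max = wi::mask (TYPE_PRECISION (otype), false,
				  TYPE_PRECISION (TREE_TYPE (op_0)));

  return ranger.range_of_expr (r0, op_0, stmt) && !r0.undefined_p ()
	 && ranger.range_of_expr (r1, op_1, stmt) && !r1.undefined_p ()
	 && wi::leu_p (r0.upper_bound (), narrow_max)
	 && wi::leu_p (r1.upper_bound (), narrow_max);
}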