Message ID | 20231011122715.3017753-1-juzhe.zhong@rivai.ai |
---|---|
State | New |
Headers | show |
Series | VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] | expand |
On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > This patch fixes this following FAILs in RISC-V regression: > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > situation as GATHER_LOAD (no conditional mask). > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > gcc/ChangeLog: > > * tree-vect-slp.cc (vect_get_operand_map): > (vect_build_slp_tree_1): > (vect_build_slp_tree_2): > * tree-vect-stmts.cc (vectorizable_load): > > --- > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > gcc/tree-vect-stmts.cc | 4 ++-- > 2 files changed, 18 insertions(+), 4 deletions(-) > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index fa098f9ff4e..712c04ec278 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > case IFN_MASK_GATHER_LOAD: > return arg1_arg4_map; > > + case IFN_MASK_LEN_GATHER_LOAD: > + /* In tree-vect-patterns.cc, we will have these 2 situations: > + > + - Unconditional gather load transforms > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > + > + - Conditional gather load transforms > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > + : nullptr; > + > case IFN_MASK_STORE: > return arg3_arg2_map; > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > if (cfn == CFN_MASK_LOAD > || cfn == CFN_GATHER_LOAD > - || cfn == CFN_MASK_GATHER_LOAD) > + || cfn == CFN_MASK_GATHER_LOAD > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > ldst_p = true; > else if (cfn == CFN_MASK_STORE) > { > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > && rhs_code != CFN_GATHER_LOAD > && rhs_code != CFN_MASK_GATHER_LOAD > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > /* Not grouped loads are handled as externals for BB > vectorization. For loop vectorization we can handle > splats the same we handle single element interleaving. */ > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > else > { > *max_nunits = this_max_nunits; > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index cd7c1090d88..263acf5d3cd 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > return false; > > mask_index = internal_fn_mask_index (ifn); > - if (mask_index >= 0 && slp_node) > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > mask_index = vect_slp_child_index_for_operand (call, mask_index); > - if (mask_index >= 0 > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > &mask, NULL, &mask_dt, &mask_vectype)) > return false; You are ignoring the mask argument and here only handle it when the IFN doesn't have a _LEN. This doesn't seem to be forward looking to the point where you want to actually handle masked (aka conditional) gather. Did you check that SLP is actually used to vectorize this? Richard.
Thanks Richi point it out. I found this patch can't make conditional gather load succeed on SLP. I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. Is it reasonable ? juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-11 20:50 To: Juzhe-Zhong CC: gcc-patches; richard.sandiford Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > This patch fixes this following FAILs in RISC-V regression: > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > situation as GATHER_LOAD (no conditional mask). > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > gcc/ChangeLog: > > * tree-vect-slp.cc (vect_get_operand_map): > (vect_build_slp_tree_1): > (vect_build_slp_tree_2): > * tree-vect-stmts.cc (vectorizable_load): > > --- > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > gcc/tree-vect-stmts.cc | 4 ++-- > 2 files changed, 18 insertions(+), 4 deletions(-) > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index fa098f9ff4e..712c04ec278 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > case IFN_MASK_GATHER_LOAD: > return arg1_arg4_map; > > + case IFN_MASK_LEN_GATHER_LOAD: > + /* In tree-vect-patterns.cc, we will have these 2 situations: > + > + - Unconditional gather load transforms > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > + > + - Conditional gather load transforms > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > + : nullptr; > + > case IFN_MASK_STORE: > return arg3_arg2_map; > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > if (cfn == CFN_MASK_LOAD > || cfn == CFN_GATHER_LOAD > - || cfn == CFN_MASK_GATHER_LOAD) > + || cfn == CFN_MASK_GATHER_LOAD > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > ldst_p = true; > else if (cfn == CFN_MASK_STORE) > { > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > && rhs_code != CFN_GATHER_LOAD > && rhs_code != CFN_MASK_GATHER_LOAD > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > /* Not grouped loads are handled as externals for BB > vectorization. For loop vectorization we can handle > splats the same we handle single element interleaving. */ > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > else > { > *max_nunits = this_max_nunits; > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index cd7c1090d88..263acf5d3cd 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > return false; > > mask_index = internal_fn_mask_index (ifn); > - if (mask_index >= 0 && slp_node) > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > mask_index = vect_slp_child_index_for_operand (call, mask_index); > - if (mask_index >= 0 > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > &mask, NULL, &mask_dt, &mask_vectype)) > return false; You are ignoring the mask argument and here only handle it when the IFN doesn't have a _LEN. This doesn't seem to be forward looking to the point where you want to actually handle masked (aka conditional) gather. Did you check that SLP is actually used to vectorize this? Richard.
On Thu, 12 Oct 2023, ??? wrote: > Thanks Richi point it out. > > I found this patch can't make conditional gather load succeed on SLP. > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > Is it reasonable ? What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments even when the mask is -1? > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-11 20:50 > To: Juzhe-Zhong > CC: gcc-patches; richard.sandiford > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > This patch fixes this following FAILs in RISC-V regression: > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > situation as GATHER_LOAD (no conditional mask). > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > gcc/ChangeLog: > > > > * tree-vect-slp.cc (vect_get_operand_map): > > (vect_build_slp_tree_1): > > (vect_build_slp_tree_2): > > * tree-vect-stmts.cc (vectorizable_load): > > > > --- > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > gcc/tree-vect-stmts.cc | 4 ++-- > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > index fa098f9ff4e..712c04ec278 100644 > > --- a/gcc/tree-vect-slp.cc > > +++ b/gcc/tree-vect-slp.cc > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > case IFN_MASK_GATHER_LOAD: > > return arg1_arg4_map; > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > + > > + - Unconditional gather load transforms > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > + > > + - Conditional gather load transforms > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > + : nullptr; > > + > > case IFN_MASK_STORE: > > return arg3_arg2_map; > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (cfn == CFN_MASK_LOAD > > || cfn == CFN_GATHER_LOAD > > - || cfn == CFN_MASK_GATHER_LOAD) > > + || cfn == CFN_MASK_GATHER_LOAD > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > ldst_p = true; > > else if (cfn == CFN_MASK_STORE) > > { > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > && rhs_code != CFN_GATHER_LOAD > > && rhs_code != CFN_MASK_GATHER_LOAD > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > /* Not grouped loads are handled as externals for BB > > vectorization. For loop vectorization we can handle > > splats the same we handle single element interleaving. */ > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > else > > { > > *max_nunits = this_max_nunits; > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index cd7c1090d88..263acf5d3cd 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > return false; > > > > mask_index = internal_fn_mask_index (ifn); > > - if (mask_index >= 0 && slp_node) > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > - if (mask_index >= 0 > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > &mask, NULL, &mask_dt, &mask_vectype)) > > return false; > > You are ignoring the mask argument and here only handle it when the > IFN doesn't have a _LEN. This doesn't seem to be forward looking > to the point where you want to actually handle masked (aka conditional) > gather. > > Did you check that SLP is actually used to vectorize this? > > Richard. > >
I tree-vect-slp.cc: vect_get_and_check_slp_defs 711: tree type = TREE_TYPE (oprnd); dt = dts[i]; if ((dt == vect_constant_def || dt == vect_external_def) && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () && (TREE_CODE (type) == BOOLEAN_TYPE || !can_duplicate_and_interleave_p (vinfo, stmts.length (), type))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Build SLP failed: invalid type of def " "for variable-length SLP %T\n", oprnd); return -1; } Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: Build SLP failed: invalid type of def juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 17:44 To: 钟居哲 CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, ??? wrote: > Thanks Richi point it out. > > I found this patch can't make conditional gather load succeed on SLP. > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > Is it reasonable ? What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments even when the mask is -1? > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-11 20:50 > To: Juzhe-Zhong > CC: gcc-patches; richard.sandiford > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > This patch fixes this following FAILs in RISC-V regression: > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > situation as GATHER_LOAD (no conditional mask). > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > gcc/ChangeLog: > > > > * tree-vect-slp.cc (vect_get_operand_map): > > (vect_build_slp_tree_1): > > (vect_build_slp_tree_2): > > * tree-vect-stmts.cc (vectorizable_load): > > > > --- > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > gcc/tree-vect-stmts.cc | 4 ++-- > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > index fa098f9ff4e..712c04ec278 100644 > > --- a/gcc/tree-vect-slp.cc > > +++ b/gcc/tree-vect-slp.cc > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > case IFN_MASK_GATHER_LOAD: > > return arg1_arg4_map; > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > + > > + - Unconditional gather load transforms > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > + > > + - Conditional gather load transforms > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > + : nullptr; > > + > > case IFN_MASK_STORE: > > return arg3_arg2_map; > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (cfn == CFN_MASK_LOAD > > || cfn == CFN_GATHER_LOAD > > - || cfn == CFN_MASK_GATHER_LOAD) > > + || cfn == CFN_MASK_GATHER_LOAD > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > ldst_p = true; > > else if (cfn == CFN_MASK_STORE) > > { > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > && rhs_code != CFN_GATHER_LOAD > > && rhs_code != CFN_MASK_GATHER_LOAD > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > /* Not grouped loads are handled as externals for BB > > vectorization. For loop vectorization we can handle > > splats the same we handle single element interleaving. */ > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > else > > { > > *max_nunits = this_max_nunits; > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index cd7c1090d88..263acf5d3cd 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > return false; > > > > mask_index = internal_fn_mask_index (ifn); > > - if (mask_index >= 0 && slp_node) > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > - if (mask_index >= 0 > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > &mask, NULL, &mask_dt, &mask_vectype)) > > return false; > > You are ignoring the mask argument and here only handle it when the > IFN doesn't have a _LEN. This doesn't seem to be forward looking > to the point where you want to actually handle masked (aka conditional) > gather. > > Did you check that SLP is actually used to vectorize this? > > Richard. > >
On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > I tree-vect-slp.cc: > vect_get_and_check_slp_defs > 711: > > tree type = TREE_TYPE (oprnd); > dt = dts[i]; > if ((dt == vect_constant_def > || dt == vect_external_def) > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > && (TREE_CODE (type) == BOOLEAN_TYPE > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > type))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "Build SLP failed: invalid type of def " > "for variable-length SLP %T\n", oprnd); > return -1; > } > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > Build SLP failed: invalid type of def I think this can be restricted to vect_external_def, but some history might reveal the cases we put this code in for (we should be able to materialize all constants?). At least uniform boolean constants should be fine. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:44 > To: ??? > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, ??? wrote: > > > Thanks Richi point it out. > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > Is it reasonable ? > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > even when the mask is -1? > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-11 20:50 > > To: Juzhe-Zhong > > CC: gcc-patches; richard.sandiford > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > situation as GATHER_LOAD (no conditional mask). > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > gcc/ChangeLog: > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > (vect_build_slp_tree_1): > > > (vect_build_slp_tree_2): > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > --- > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > index fa098f9ff4e..712c04ec278 100644 > > > --- a/gcc/tree-vect-slp.cc > > > +++ b/gcc/tree-vect-slp.cc > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > case IFN_MASK_GATHER_LOAD: > > > return arg1_arg4_map; > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > + > > > + - Unconditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > + > > > + - Conditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > + : nullptr; > > > + > > > case IFN_MASK_STORE: > > > return arg3_arg2_map; > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > if (cfn == CFN_MASK_LOAD > > > || cfn == CFN_GATHER_LOAD > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > + || cfn == CFN_MASK_GATHER_LOAD > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > ldst_p = true; > > > else if (cfn == CFN_MASK_STORE) > > > { > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > && rhs_code != CFN_GATHER_LOAD > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > /* Not grouped loads are handled as externals for BB > > > vectorization. For loop vectorization we can handle > > > splats the same we handle single element interleaving. */ > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > else > > > { > > > *max_nunits = this_max_nunits; > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > index cd7c1090d88..263acf5d3cd 100644 > > > --- a/gcc/tree-vect-stmts.cc > > > +++ b/gcc/tree-vect-stmts.cc > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > return false; > > > > > > mask_index = internal_fn_mask_index (ifn); > > > - if (mask_index >= 0 && slp_node) > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > - if (mask_index >= 0 > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > return false; > > > > You are ignoring the mask argument and here only handle it when the > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > to the point where you want to actually handle masked (aka conditional) > > gather. > > > > Did you check that SLP is actually used to vectorize this? > > > > Richard. > > > > > >
Hi, Richi. I restrict as you said into vect_external_def. Then this condition made SLP failed: - if (mask_index >= 0 + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, &mask, NULL, &mask_dt, &mask_vectype)) return false; So I add 'internal_fn_len_index (ifn) < 0' for MASK_LEN_GATHER_LOAD does not check scalar mask. Then ICE here: vect_slp_analyze_node_operations if (child && (SLP_TREE_DEF_TYPE (child) == vect_constant_def || SLP_TREE_DEF_TYPE (child) == vect_external_def) /* Perform usual caching, note code-generation still code-gens these nodes multiple times but we expect to CSE them later. */ && !visited_set.add (child)) { visited_vec.safe_push (child); /* ??? After auditing more code paths make a "default" and push the vector type from NODE to all children if it is not already set. */ /* Compute the number of vectors to be generated. */ tree vector_type = SLP_TREE_VECTYPE (child); if (!vector_type) { /* For shifts with a scalar argument we don't need to cost or code-generate anything. ??? Represent this more explicitely. */ gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) ----> assert FAILed. == shift_vec_info_type) && j == 1); continue; } Could you help me with that? juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 17:55 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > I tree-vect-slp.cc: > vect_get_and_check_slp_defs > 711: > > tree type = TREE_TYPE (oprnd); > dt = dts[i]; > if ((dt == vect_constant_def > || dt == vect_external_def) > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > && (TREE_CODE (type) == BOOLEAN_TYPE > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > type))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "Build SLP failed: invalid type of def " > "for variable-length SLP %T\n", oprnd); > return -1; > } > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > Build SLP failed: invalid type of def I think this can be restricted to vect_external_def, but some history might reveal the cases we put this code in for (we should be able to materialize all constants?). At least uniform boolean constants should be fine. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:44 > To: ??? > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, ??? wrote: > > > Thanks Richi point it out. > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > Is it reasonable ? > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > even when the mask is -1? > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-11 20:50 > > To: Juzhe-Zhong > > CC: gcc-patches; richard.sandiford > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > situation as GATHER_LOAD (no conditional mask). > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > gcc/ChangeLog: > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > (vect_build_slp_tree_1): > > > (vect_build_slp_tree_2): > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > --- > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > index fa098f9ff4e..712c04ec278 100644 > > > --- a/gcc/tree-vect-slp.cc > > > +++ b/gcc/tree-vect-slp.cc > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > case IFN_MASK_GATHER_LOAD: > > > return arg1_arg4_map; > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > + > > > + - Unconditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > + > > > + - Conditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > + : nullptr; > > > + > > > case IFN_MASK_STORE: > > > return arg3_arg2_map; > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > if (cfn == CFN_MASK_LOAD > > > || cfn == CFN_GATHER_LOAD > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > + || cfn == CFN_MASK_GATHER_LOAD > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > ldst_p = true; > > > else if (cfn == CFN_MASK_STORE) > > > { > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > && rhs_code != CFN_GATHER_LOAD > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > /* Not grouped loads are handled as externals for BB > > > vectorization. For loop vectorization we can handle > > > splats the same we handle single element interleaving. */ > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > else > > > { > > > *max_nunits = this_max_nunits; > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > index cd7c1090d88..263acf5d3cd 100644 > > > --- a/gcc/tree-vect-stmts.cc > > > +++ b/gcc/tree-vect-stmts.cc > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > return false; > > > > > > mask_index = internal_fn_mask_index (ifn); > > > - if (mask_index >= 0 && slp_node) > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > - if (mask_index >= 0 > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > return false; > > > > You are ignoring the mask argument and here only handle it when the > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > to the point where you want to actually handle masked (aka conditional) > > gather. > > > > Did you check that SLP is actually used to vectorize this? > > > > Richard. > > > > > >
Oh. I see. Here make vect_constant_def failed to SLP: tree-vect-slp.cc: vect_build_slp_tree_2 line 2354: if (oprnd_info->first_dt == vect_external_def || oprnd_info->first_dt == vect_constant_def) { slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; oprnd_info->ops = vNULL; children.safe_push (invnode); continue; } It seems that we handle vect_constant_def same as vect_external_def. So failed to SLP ? juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 17:55 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > I tree-vect-slp.cc: > vect_get_and_check_slp_defs > 711: > > tree type = TREE_TYPE (oprnd); > dt = dts[i]; > if ((dt == vect_constant_def > || dt == vect_external_def) > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > && (TREE_CODE (type) == BOOLEAN_TYPE > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > type))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "Build SLP failed: invalid type of def " > "for variable-length SLP %T\n", oprnd); > return -1; > } > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > Build SLP failed: invalid type of def I think this can be restricted to vect_external_def, but some history might reveal the cases we put this code in for (we should be able to materialize all constants?). At least uniform boolean constants should be fine. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:44 > To: ??? > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, ??? wrote: > > > Thanks Richi point it out. > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > Is it reasonable ? > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > even when the mask is -1? > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-11 20:50 > > To: Juzhe-Zhong > > CC: gcc-patches; richard.sandiford > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > situation as GATHER_LOAD (no conditional mask). > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > gcc/ChangeLog: > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > (vect_build_slp_tree_1): > > > (vect_build_slp_tree_2): > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > --- > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > index fa098f9ff4e..712c04ec278 100644 > > > --- a/gcc/tree-vect-slp.cc > > > +++ b/gcc/tree-vect-slp.cc > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > case IFN_MASK_GATHER_LOAD: > > > return arg1_arg4_map; > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > + > > > + - Unconditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > + > > > + - Conditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > + : nullptr; > > > + > > > case IFN_MASK_STORE: > > > return arg3_arg2_map; > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > if (cfn == CFN_MASK_LOAD > > > || cfn == CFN_GATHER_LOAD > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > + || cfn == CFN_MASK_GATHER_LOAD > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > ldst_p = true; > > > else if (cfn == CFN_MASK_STORE) > > > { > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > && rhs_code != CFN_GATHER_LOAD > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > /* Not grouped loads are handled as externals for BB > > > vectorization. For loop vectorization we can handle > > > splats the same we handle single element interleaving. */ > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > else > > > { > > > *max_nunits = this_max_nunits; > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > index cd7c1090d88..263acf5d3cd 100644 > > > --- a/gcc/tree-vect-stmts.cc > > > +++ b/gcc/tree-vect-stmts.cc > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > return false; > > > > > > mask_index = internal_fn_mask_index (ifn); > > > - if (mask_index >= 0 && slp_node) > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > - if (mask_index >= 0 > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > return false; > > > > You are ignoring the mask argument and here only handle it when the > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > to the point where you want to actually handle masked (aka conditional) > > gather. > > > > Did you check that SLP is actually used to vectorize this? > > > > Richard. > > > > > >
In tree-vect-stmts.cc vect_check_scalar_mask Failed here: /* If the caller is not prepared for adjusting an external/constant SLP mask vector type fail. */ if (slp_node && !mask_node && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "SLP mask argument is not vectorized.\n"); return false; } If we allow vect_constant_def, we should adjust constant SLP mask ? in the caller "vectorizable_load" ? But I don't know how to adjust that. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 17:55 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > I tree-vect-slp.cc: > vect_get_and_check_slp_defs > 711: > > tree type = TREE_TYPE (oprnd); > dt = dts[i]; > if ((dt == vect_constant_def > || dt == vect_external_def) > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > && (TREE_CODE (type) == BOOLEAN_TYPE > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > type))) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "Build SLP failed: invalid type of def " > "for variable-length SLP %T\n", oprnd); > return -1; > } > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > Build SLP failed: invalid type of def I think this can be restricted to vect_external_def, but some history might reveal the cases we put this code in for (we should be able to materialize all constants?). At least uniform boolean constants should be fine. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:44 > To: ??? > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, ??? wrote: > > > Thanks Richi point it out. > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > Is it reasonable ? > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > even when the mask is -1? > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-11 20:50 > > To: Juzhe-Zhong > > CC: gcc-patches; richard.sandiford > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > situation as GATHER_LOAD (no conditional mask). > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > gcc/ChangeLog: > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > (vect_build_slp_tree_1): > > > (vect_build_slp_tree_2): > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > --- > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > index fa098f9ff4e..712c04ec278 100644 > > > --- a/gcc/tree-vect-slp.cc > > > +++ b/gcc/tree-vect-slp.cc > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > case IFN_MASK_GATHER_LOAD: > > > return arg1_arg4_map; > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > + > > > + - Unconditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > + > > > + - Conditional gather load transforms > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > + : nullptr; > > > + > > > case IFN_MASK_STORE: > > > return arg3_arg2_map; > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > if (cfn == CFN_MASK_LOAD > > > || cfn == CFN_GATHER_LOAD > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > + || cfn == CFN_MASK_GATHER_LOAD > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > ldst_p = true; > > > else if (cfn == CFN_MASK_STORE) > > > { > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > && rhs_code != CFN_GATHER_LOAD > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > /* Not grouped loads are handled as externals for BB > > > vectorization. For loop vectorization we can handle > > > splats the same we handle single element interleaving. */ > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > else > > > { > > > *max_nunits = this_max_nunits; > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > index cd7c1090d88..263acf5d3cd 100644 > > > --- a/gcc/tree-vect-stmts.cc > > > +++ b/gcc/tree-vect-stmts.cc > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > return false; > > > > > > mask_index = internal_fn_mask_index (ifn); > > > - if (mask_index >= 0 && slp_node) > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > - if (mask_index >= 0 > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > return false; > > > > You are ignoring the mask argument and here only handle it when the > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > to the point where you want to actually handle masked (aka conditional) > > gather. > > > > Did you check that SLP is actually used to vectorize this? > > > > Richard. > > > > > >
On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > Hi, Richi. > > I restrict as you said into vect_external_def. > > Then this condition made SLP failed: > > - if (mask_index >= 0 > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > &mask, NULL, &mask_dt, &mask_vectype)) > return false; > > So I add 'internal_fn_len_index (ifn) < 0' for MASK_LEN_GATHER_LOAD does not check scalar mask. Rather figure why. > Then ICE here: > > vect_slp_analyze_node_operations > if (child > && (SLP_TREE_DEF_TYPE (child) == vect_constant_def > || SLP_TREE_DEF_TYPE (child) == vect_external_def) > /* Perform usual caching, note code-generation still > code-gens these nodes multiple times but we expect > to CSE them later. */ > && !visited_set.add (child)) > { > visited_vec.safe_push (child); > /* ??? After auditing more code paths make a "default" > and push the vector type from NODE to all children > if it is not already set. */ > /* Compute the number of vectors to be generated. */ > tree vector_type = SLP_TREE_VECTYPE (child); > if (!vector_type) > { > /* For shifts with a scalar argument we don't need > to cost or code-generate anything. > ??? Represent this more explicitely. */ > gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) ----> assert FAILed. > == shift_vec_info_type) > && j == 1); > continue; > } > > Could you help me with that? > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:55 > To: juzhe.zhong@rivai.ai > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > > > I tree-vect-slp.cc: > > vect_get_and_check_slp_defs > > 711: > > > > tree type = TREE_TYPE (oprnd); > > dt = dts[i]; > > if ((dt == vect_constant_def > > || dt == vect_external_def) > > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > > && (TREE_CODE (type) == BOOLEAN_TYPE > > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > > type))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "Build SLP failed: invalid type of def " > > "for variable-length SLP %T\n", oprnd); > > return -1; > > } > > > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > > Build SLP failed: invalid type of def > > I think this can be restricted to vect_external_def, but some history > might reveal the cases we put this code in for (we should be able to > materialize all constants?). At least uniform boolean constants > should be fine. > > > > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-12 17:44 > > To: ??? > > CC: gcc-patches; richard.sandiford > > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Thu, 12 Oct 2023, ??? wrote: > > > > > Thanks Richi point it out. > > > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > > > Is it reasonable ? > > > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > > even when the mask is -1? > > > > > > > > juzhe.zhong@rivai.ai > > > > > > From: Richard Biener > > > Date: 2023-10-11 20:50 > > > To: Juzhe-Zhong > > > CC: gcc-patches; richard.sandiford > > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > > situation as GATHER_LOAD (no conditional mask). > > > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > > (vect_build_slp_tree_1): > > > > (vect_build_slp_tree_2): > > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > > > --- > > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > > index fa098f9ff4e..712c04ec278 100644 > > > > --- a/gcc/tree-vect-slp.cc > > > > +++ b/gcc/tree-vect-slp.cc > > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > > case IFN_MASK_GATHER_LOAD: > > > > return arg1_arg4_map; > > > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > > + > > > > + - Unconditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > > + > > > > + - Conditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > > + : nullptr; > > > > + > > > > case IFN_MASK_STORE: > > > > return arg3_arg2_map; > > > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > > > if (cfn == CFN_MASK_LOAD > > > > || cfn == CFN_GATHER_LOAD > > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > > + || cfn == CFN_MASK_GATHER_LOAD > > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > > ldst_p = true; > > > > else if (cfn == CFN_MASK_STORE) > > > > { > > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > > && rhs_code != CFN_GATHER_LOAD > > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > > /* Not grouped loads are handled as externals for BB > > > > vectorization. For loop vectorization we can handle > > > > splats the same we handle single element interleaving. */ > > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > > else > > > > { > > > > *max_nunits = this_max_nunits; > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index cd7c1090d88..263acf5d3cd 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > > return false; > > > > > > > > mask_index = internal_fn_mask_index (ifn); > > > > - if (mask_index >= 0 && slp_node) > > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > > - if (mask_index >= 0 > > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > > return false; > > > > > > You are ignoring the mask argument and here only handle it when the > > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > > to the point where you want to actually handle masked (aka conditional) > > > gather. > > > > > > Did you check that SLP is actually used to vectorize this? > > > > > > Richard. > > > > > > > > > > > >
On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > Oh. I see. > > Here make vect_constant_def failed to SLP: > > tree-vect-slp.cc: > vect_build_slp_tree_2 > line 2354: > > if (oprnd_info->first_dt == vect_external_def > || oprnd_info->first_dt == vect_constant_def) > { > slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); > SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; > oprnd_info->ops = vNULL; > children.safe_push (invnode); > continue; > } > > It seems that we handle vect_constant_def same as vect_external_def. > So failed to SLP ? Why? We _should_ see a SLP node for the all-true mask operand. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:55 > To: juzhe.zhong@rivai.ai > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > > > I tree-vect-slp.cc: > > vect_get_and_check_slp_defs > > 711: > > > > tree type = TREE_TYPE (oprnd); > > dt = dts[i]; > > if ((dt == vect_constant_def > > || dt == vect_external_def) > > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > > && (TREE_CODE (type) == BOOLEAN_TYPE > > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > > type))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "Build SLP failed: invalid type of def " > > "for variable-length SLP %T\n", oprnd); > > return -1; > > } > > > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > > Build SLP failed: invalid type of def > > I think this can be restricted to vect_external_def, but some history > might reveal the cases we put this code in for (we should be able to > materialize all constants?). At least uniform boolean constants > should be fine. > > > > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-12 17:44 > > To: ??? > > CC: gcc-patches; richard.sandiford > > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Thu, 12 Oct 2023, ??? wrote: > > > > > Thanks Richi point it out. > > > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > > > Is it reasonable ? > > > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > > even when the mask is -1? > > > > > > > > juzhe.zhong@rivai.ai > > > > > > From: Richard Biener > > > Date: 2023-10-11 20:50 > > > To: Juzhe-Zhong > > > CC: gcc-patches; richard.sandiford > > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > > situation as GATHER_LOAD (no conditional mask). > > > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > > (vect_build_slp_tree_1): > > > > (vect_build_slp_tree_2): > > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > > > --- > > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > > index fa098f9ff4e..712c04ec278 100644 > > > > --- a/gcc/tree-vect-slp.cc > > > > +++ b/gcc/tree-vect-slp.cc > > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > > case IFN_MASK_GATHER_LOAD: > > > > return arg1_arg4_map; > > > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > > + > > > > + - Unconditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > > + > > > > + - Conditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > > + : nullptr; > > > > + > > > > case IFN_MASK_STORE: > > > > return arg3_arg2_map; > > > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > > > if (cfn == CFN_MASK_LOAD > > > > || cfn == CFN_GATHER_LOAD > > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > > + || cfn == CFN_MASK_GATHER_LOAD > > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > > ldst_p = true; > > > > else if (cfn == CFN_MASK_STORE) > > > > { > > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > > && rhs_code != CFN_GATHER_LOAD > > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > > /* Not grouped loads are handled as externals for BB > > > > vectorization. For loop vectorization we can handle > > > > splats the same we handle single element interleaving. */ > > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > > else > > > > { > > > > *max_nunits = this_max_nunits; > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index cd7c1090d88..263acf5d3cd 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > > return false; > > > > > > > > mask_index = internal_fn_mask_index (ifn); > > > > - if (mask_index >= 0 && slp_node) > > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > > - if (mask_index >= 0 > > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > > return false; > > > > > > You are ignoring the mask argument and here only handle it when the > > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > > to the point where you want to actually handle masked (aka conditional) > > > gather. > > > > > > Did you check that SLP is actually used to vectorize this? > > > > > > Richard. > > > > > > > > > > > >
On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > In tree-vect-stmts.cc > > vect_check_scalar_mask > > Failed here: > > /* If the caller is not prepared for adjusting an external/constant > SLP mask vector type fail. */ > if (slp_node > && !mask_node ^^^ where's the mask_node? > && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "SLP mask argument is not vectorized.\n"); > return false; > } > > If we allow vect_constant_def, we should adjust constant SLP mask ? in the caller "vectorizable_load" ? > > But I don't know how to adjust that. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:55 > To: juzhe.zhong@rivai.ai > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > > > I tree-vect-slp.cc: > > vect_get_and_check_slp_defs > > 711: > > > > tree type = TREE_TYPE (oprnd); > > dt = dts[i]; > > if ((dt == vect_constant_def > > || dt == vect_external_def) > > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > > && (TREE_CODE (type) == BOOLEAN_TYPE > > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > > type))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "Build SLP failed: invalid type of def " > > "for variable-length SLP %T\n", oprnd); > > return -1; > > } > > > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > > Build SLP failed: invalid type of def > > I think this can be restricted to vect_external_def, but some history > might reveal the cases we put this code in for (we should be able to > materialize all constants?). At least uniform boolean constants > should be fine. > > > > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-12 17:44 > > To: ??? > > CC: gcc-patches; richard.sandiford > > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Thu, 12 Oct 2023, ??? wrote: > > > > > Thanks Richi point it out. > > > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > > > Is it reasonable ? > > > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > > even when the mask is -1? > > > > > > > > juzhe.zhong@rivai.ai > > > > > > From: Richard Biener > > > Date: 2023-10-11 20:50 > > > To: Juzhe-Zhong > > > CC: gcc-patches; richard.sandiford > > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > > situation as GATHER_LOAD (no conditional mask). > > > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > > (vect_build_slp_tree_1): > > > > (vect_build_slp_tree_2): > > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > > > --- > > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > > index fa098f9ff4e..712c04ec278 100644 > > > > --- a/gcc/tree-vect-slp.cc > > > > +++ b/gcc/tree-vect-slp.cc > > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > > case IFN_MASK_GATHER_LOAD: > > > > return arg1_arg4_map; > > > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > > + > > > > + - Unconditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > > + > > > > + - Conditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > > + : nullptr; > > > > + > > > > case IFN_MASK_STORE: > > > > return arg3_arg2_map; > > > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > > > if (cfn == CFN_MASK_LOAD > > > > || cfn == CFN_GATHER_LOAD > > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > > + || cfn == CFN_MASK_GATHER_LOAD > > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > > ldst_p = true; > > > > else if (cfn == CFN_MASK_STORE) > > > > { > > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > > && rhs_code != CFN_GATHER_LOAD > > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > > /* Not grouped loads are handled as externals for BB > > > > vectorization. For loop vectorization we can handle > > > > splats the same we handle single element interleaving. */ > > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > > else > > > > { > > > > *max_nunits = this_max_nunits; > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index cd7c1090d88..263acf5d3cd 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > > return false; > > > > > > > > mask_index = internal_fn_mask_index (ifn); > > > > - if (mask_index >= 0 && slp_node) > > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > > - if (mask_index >= 0 > > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > > return false; > > > > > > You are ignoring the mask argument and here only handle it when the > > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > > to the point where you want to actually handle masked (aka conditional) > > > gather. > > > > > > Did you check that SLP is actually used to vectorize this? > > > > > > Richard. > > > > > > > > > > > >
The mask node is NULL since the caller : if (mask_index >= 0 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, &mask, NULL, &mask_dt, &mask_vectype)) return false; pass NULL to mask_node. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 19:14 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > In tree-vect-stmts.cc > > vect_check_scalar_mask > > Failed here: > > /* If the caller is not prepared for adjusting an external/constant > SLP mask vector type fail. */ > if (slp_node > && !mask_node ^^^ where's the mask_node? > && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "SLP mask argument is not vectorized.\n"); > return false; > } > > If we allow vect_constant_def, we should adjust constant SLP mask ? in the caller "vectorizable_load" ? > > But I don't know how to adjust that. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:55 > To: juzhe.zhong@rivai.ai > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > > > I tree-vect-slp.cc: > > vect_get_and_check_slp_defs > > 711: > > > > tree type = TREE_TYPE (oprnd); > > dt = dts[i]; > > if ((dt == vect_constant_def > > || dt == vect_external_def) > > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > > && (TREE_CODE (type) == BOOLEAN_TYPE > > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > > type))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "Build SLP failed: invalid type of def " > > "for variable-length SLP %T\n", oprnd); > > return -1; > > } > > > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > > Build SLP failed: invalid type of def > > I think this can be restricted to vect_external_def, but some history > might reveal the cases we put this code in for (we should be able to > materialize all constants?). At least uniform boolean constants > should be fine. > > > > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-12 17:44 > > To: ??? > > CC: gcc-patches; richard.sandiford > > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Thu, 12 Oct 2023, ??? wrote: > > > > > Thanks Richi point it out. > > > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > > > Is it reasonable ? > > > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > > even when the mask is -1? > > > > > > > > juzhe.zhong@rivai.ai > > > > > > From: Richard Biener > > > Date: 2023-10-11 20:50 > > > To: Juzhe-Zhong > > > CC: gcc-patches; richard.sandiford > > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > > situation as GATHER_LOAD (no conditional mask). > > > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > > (vect_build_slp_tree_1): > > > > (vect_build_slp_tree_2): > > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > > > --- > > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > > index fa098f9ff4e..712c04ec278 100644 > > > > --- a/gcc/tree-vect-slp.cc > > > > +++ b/gcc/tree-vect-slp.cc > > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > > case IFN_MASK_GATHER_LOAD: > > > > return arg1_arg4_map; > > > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > > + > > > > + - Unconditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > > + > > > > + - Conditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > > + : nullptr; > > > > + > > > > case IFN_MASK_STORE: > > > > return arg3_arg2_map; > > > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > > > if (cfn == CFN_MASK_LOAD > > > > || cfn == CFN_GATHER_LOAD > > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > > + || cfn == CFN_MASK_GATHER_LOAD > > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > > ldst_p = true; > > > > else if (cfn == CFN_MASK_STORE) > > > > { > > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > > && rhs_code != CFN_GATHER_LOAD > > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > > /* Not grouped loads are handled as externals for BB > > > > vectorization. For loop vectorization we can handle > > > > splats the same we handle single element interleaving. */ > > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > > else > > > > { > > > > *max_nunits = this_max_nunits; > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index cd7c1090d88..263acf5d3cd 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > > return false; > > > > > > > > mask_index = internal_fn_mask_index (ifn); > > > > - if (mask_index >= 0 && slp_node) > > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > > - if (mask_index >= 0 > > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > > return false; > > > > > > You are ignoring the mask argument and here only handle it when the > > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > > to the point where you want to actually handle masked (aka conditional) > > > gather. > > > > > > Did you check that SLP is actually used to vectorize this? > > > > > > Richard. > > > > > > > > > > > >
Hi, Richi. As you suggest, I keep MAK_LEN_GATHER_LOAD (...,-1) format and support SLP for that in V3: https://gcc.gnu.org/pipermail/gcc-patches/2023-October/632846.html Thanks. juzhe.zhong@rivai.ai From: Richard Biener Date: 2023-10-12 19:14 To: juzhe.zhong@rivai.ai CC: gcc-patches; richard.sandiford Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > In tree-vect-stmts.cc > > vect_check_scalar_mask > > Failed here: > > /* If the caller is not prepared for adjusting an external/constant > SLP mask vector type fail. */ > if (slp_node > && !mask_node ^^^ where's the mask_node? > && SLP_TREE_DEF_TYPE (mask_node_1) != vect_internal_def) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "SLP mask argument is not vectorized.\n"); > return false; > } > > If we allow vect_constant_def, we should adjust constant SLP mask ? in the caller "vectorizable_load" ? > > But I don't know how to adjust that. > > > > juzhe.zhong@rivai.ai > > From: Richard Biener > Date: 2023-10-12 17:55 > To: juzhe.zhong@rivai.ai > CC: gcc-patches; richard.sandiford > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > On Thu, 12 Oct 2023, juzhe.zhong@rivai.ai wrote: > > > I tree-vect-slp.cc: > > vect_get_and_check_slp_defs > > 711: > > > > tree type = TREE_TYPE (oprnd); > > dt = dts[i]; > > if ((dt == vect_constant_def > > || dt == vect_external_def) > > && !GET_MODE_SIZE (vinfo->vector_mode).is_constant () > > && (TREE_CODE (type) == BOOLEAN_TYPE > > || !can_duplicate_and_interleave_p (vinfo, stmts.length (), > > type))) > > { > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "Build SLP failed: invalid type of def " > > "for variable-length SLP %T\n", oprnd); > > return -1; > > } > > > > Here mask = -1 is BOOLEAN type in tree-vect-patterns.cc reaching this condition, then SLP failed: > > Build SLP failed: invalid type of def > > I think this can be restricted to vect_external_def, but some history > might reveal the cases we put this code in for (we should be able to > materialize all constants?). At least uniform boolean constants > should be fine. > > > > > > > > juzhe.zhong@rivai.ai > > > > From: Richard Biener > > Date: 2023-10-12 17:44 > > To: ??? > > CC: gcc-patches; richard.sandiford > > Subject: Re: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > On Thu, 12 Oct 2023, ??? wrote: > > > > > Thanks Richi point it out. > > > > > > I found this patch can't make conditional gather load succeed on SLP. > > > > > > I am considering change MASK_LEN_GATHER_LOAD in pattern recognization: > > > > > > If no condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0) -> 4 arguments same as GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the GATHER_LOAD SLP flow naturally. > > > > > > If has condition mask, in tree-vect-patterns.cc, I build MASK_LEN_GATHER_LOAD (ptr, offset, scale, 0, condition) -> 5 arguments same as MASK_GATHER_LOAD. > > > In this situation, MASK_LEN_GATHER_LOAD can resue the MASK_GATHER_LOAD SLP flow naturally. > > > > > > Is it reasonable ? > > > > What's wrong with handling MASK_LEN_GATHER_LOAD with all arguments > > even when the mask is -1? > > > > > > > > juzhe.zhong@rivai.ai > > > > > > From: Richard Biener > > > Date: 2023-10-11 20:50 > > > To: Juzhe-Zhong > > > CC: gcc-patches; richard.sandiford > > > Subject: Re: [PATCH] VECT: Enhance SLP of MASK_LEN_GATHER_LOAD[PR111721] > > > On Wed, 11 Oct 2023, Juzhe-Zhong wrote: > > > > > > > This patch fixes this following FAILs in RISC-V regression: > > > > > > > > FAIL: gcc.dg/vect/vect-gather-1.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-1.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c -flto -ffat-lto-objects scan-tree-dump vect "Loop contains only SLP stmts" > > > > FAIL: gcc.dg/vect/vect-gather-3.c scan-tree-dump vect "Loop contains only SLP stmts" > > > > > > > > The root cause of these FAIL is that GCC SLP failed on MASK_LEN_GATHER_LOAD. > > > > > > > > Since for RVV, we build MASK_LEN_GATHER_LOAD with dummy mask (-1) in tree-vect-patterns.cc if it is same > > > > situation as GATHER_LOAD (no conditional mask). > > > > > > > > So we make MASK_LEN_GATHER_LOAD leverage the flow of GATHER_LOAD if mask argument is a dummy mask. > > > > > > > > gcc/ChangeLog: > > > > > > > > * tree-vect-slp.cc (vect_get_operand_map): > > > > (vect_build_slp_tree_1): > > > > (vect_build_slp_tree_2): > > > > * tree-vect-stmts.cc (vectorizable_load): > > > > > > > > --- > > > > gcc/tree-vect-slp.cc | 18 ++++++++++++++++-- > > > > gcc/tree-vect-stmts.cc | 4 ++-- > > > > 2 files changed, 18 insertions(+), 4 deletions(-) > > > > > > > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > > > index fa098f9ff4e..712c04ec278 100644 > > > > --- a/gcc/tree-vect-slp.cc > > > > +++ b/gcc/tree-vect-slp.cc > > > > @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) > > > > case IFN_MASK_GATHER_LOAD: > > > > return arg1_arg4_map; > > > > > > > > + case IFN_MASK_LEN_GATHER_LOAD: > > > > + /* In tree-vect-patterns.cc, we will have these 2 situations: > > > > + > > > > + - Unconditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. > > > > + > > > > + - Conditional gather load transforms > > > > + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ > > > > + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map > > > > + : nullptr; > > > > + > > > > case IFN_MASK_STORE: > > > > return arg3_arg2_map; > > > > > > > > @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > > > > > if (cfn == CFN_MASK_LOAD > > > > || cfn == CFN_GATHER_LOAD > > > > - || cfn == CFN_MASK_GATHER_LOAD) > > > > + || cfn == CFN_MASK_GATHER_LOAD > > > > + || cfn == CFN_MASK_LEN_GATHER_LOAD) > > > > ldst_p = true; > > > > else if (cfn == CFN_MASK_STORE) > > > > { > > > > @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, > > > > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > > > > && rhs_code != CFN_GATHER_LOAD > > > > && rhs_code != CFN_MASK_GATHER_LOAD > > > > + && rhs_code != CFN_MASK_LEN_GATHER_LOAD > > > > /* Not grouped loads are handled as externals for BB > > > > vectorization. For loop vectorization we can handle > > > > splats the same we handle single element interleaving. */ > > > > @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > > > > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > > > > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > > > > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > > > > - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > > > > + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) > > > > + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); > > > > else > > > > { > > > > *max_nunits = this_max_nunits; > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > > > index cd7c1090d88..263acf5d3cd 100644 > > > > --- a/gcc/tree-vect-stmts.cc > > > > +++ b/gcc/tree-vect-stmts.cc > > > > @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, > > > > return false; > > > > > > > > mask_index = internal_fn_mask_index (ifn); > > > > - if (mask_index >= 0 && slp_node) > > > > + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) > > > > mask_index = vect_slp_child_index_for_operand (call, mask_index); > > > > - if (mask_index >= 0 > > > > + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 > > > > && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, > > > > &mask, NULL, &mask_dt, &mask_vectype)) > > > > return false; > > > > > > You are ignoring the mask argument and here only handle it when the > > > IFN doesn't have a _LEN. This doesn't seem to be forward looking > > > to the point where you want to actually handle masked (aka conditional) > > > gather. > > > > > > Did you check that SLP is actually used to vectorize this? > > > > > > Richard. > > > > > > > > > > > >
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc index fa098f9ff4e..712c04ec278 100644 --- a/gcc/tree-vect-slp.cc +++ b/gcc/tree-vect-slp.cc @@ -544,6 +544,17 @@ vect_get_operand_map (const gimple *stmt, unsigned char swap = 0) case IFN_MASK_GATHER_LOAD: return arg1_arg4_map; + case IFN_MASK_LEN_GATHER_LOAD: + /* In tree-vect-patterns.cc, we will have these 2 situations: + + - Unconditional gather load transforms + into MASK_LEN_GATHER_LOAD with dummy mask which is -1. + + - Conditional gather load transforms + into MASK_LEN_GATHER_LOAD with real conditional mask.*/ + return integer_minus_onep (gimple_call_arg (call, 4)) ? arg1_map + : nullptr; + case IFN_MASK_STORE: return arg3_arg2_map; @@ -1077,7 +1088,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, if (cfn == CFN_MASK_LOAD || cfn == CFN_GATHER_LOAD - || cfn == CFN_MASK_GATHER_LOAD) + || cfn == CFN_MASK_GATHER_LOAD + || cfn == CFN_MASK_LEN_GATHER_LOAD) ldst_p = true; else if (cfn == CFN_MASK_STORE) { @@ -1337,6 +1349,7 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap, if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) && rhs_code != CFN_GATHER_LOAD && rhs_code != CFN_MASK_GATHER_LOAD + && rhs_code != CFN_MASK_LEN_GATHER_LOAD /* Not grouped loads are handled as externals for BB vectorization. For loop vectorization we can handle splats the same we handle single element interleaving. */ @@ -1837,7 +1850,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) - || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); + || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD) + || gimple_call_internal_p (stmt, IFN_MASK_LEN_GATHER_LOAD)); else { *max_nunits = this_max_nunits; diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index cd7c1090d88..263acf5d3cd 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -9575,9 +9575,9 @@ vectorizable_load (vec_info *vinfo, return false; mask_index = internal_fn_mask_index (ifn); - if (mask_index >= 0 && slp_node) + if (mask_index >= 0 && slp_node && internal_fn_len_index (ifn) < 0) mask_index = vect_slp_child_index_for_operand (call, mask_index); - if (mask_index >= 0 + if (mask_index >= 0 && internal_fn_len_index (ifn) < 0 && !vect_check_scalar_mask (vinfo, stmt_info, slp_node, mask_index, &mask, NULL, &mask_dt, &mask_vectype)) return false;