Message ID | 20210713214956.2010942-1-hjl.tools@gmail.com |
---|---|
State | New |
Headers | show |
Series | Rewrite memset expanders with vec_duplicate | expand |
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > scratch register to avoid stack realignment when expanding memset. > > PR middle-end/90773 > * builtins.c (gen_memset_value_from_prev): New function. > (gen_memset_broadcast): Likewise. > (builtin_memset_read_str): Use gen_memset_value_from_prev > and gen_memset_broadcast. > (builtin_memset_gen_str): Likewise. > * target.def (gen_memset_scratch_rtx): New hook. > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > * doc/tm.texi: Regenerated. > --- > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > gcc/doc/tm.texi | 5 ++ > gcc/doc/tm.texi.in | 2 + > gcc/target.def | 7 +++ > 4 files changed, 116 insertions(+), 21 deletions(-) > > diff --git a/gcc/builtins.c b/gcc/builtins.c > index 39ab139b7e1..c1758ae2efc 100644 > --- a/gcc/builtins.c > +++ b/gcc/builtins.c > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > return NULL_RTX; > } > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > - bytes from constant string DATA + OFFSET and return it as target > - constant. If PREV isn't nullptr, it has the RTL info from the > +/* Return the RTL of a register in MODE generated from PREV in the > previous iteration. */ > > -rtx > -builtin_memset_read_str (void *data, void *prevp, > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > - scalar_int_mode mode) > +static rtx > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > { > + rtx target = nullptr; > by_pieces_prev *prev = (by_pieces_prev *) prevp; > if (prev != nullptr && prev->data != nullptr) > { > /* Use the previous data in the same mode. */ > if (prev->mode == mode) > return prev->data; > + > + rtx prev_rtx = prev->data; > + machine_mode prev_mode = prev->mode; > + unsigned int word_size = GET_MODE_SIZE (word_mode); > + if (word_size < GET_MODE_SIZE (prev->mode) > + && word_size > GET_MODE_SIZE (mode)) > + { > + /* First generate subreg of word mode if the previous mode is > + wider than word mode and word mode is wider than MODE. */ > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > + prev_mode, 0); > + prev_mode = word_mode; > + } > + if (prev_rtx != nullptr) > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > } > + return target; > +} > + > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > + > +static rtx > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > +{ > + /* Skip if regno_reg_rtx isn't initialized. */ > + if (!regno_reg_rtx) > + return nullptr; > + > + rtx target = nullptr; > + > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > + machine_mode vector_mode; > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > + gcc_unreachable (); Sorry, I realise it's a bit late to be raising this objection now, but I don't think it's a good idea to use scalar integer modes as a proxy for vector modes. In principle there's no reason why a target has to define an integer mode for every vector mode. If we want the mode to be a vector then I think the by-pieces infrastructure should be extended to support vectors directly, rather than assuming that each piece can be represented as a scalar_int_mode. Thanks, Richard > + > + enum insn_code icode = optab_handler (vec_duplicate_optab, > + vector_mode); > + if (icode != CODE_FOR_nothing) > + { > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); > + if (CONST_INT_P (data)) > + { > + /* Use the move expander with CONST_VECTOR. */ > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); > + emit_move_insn (reg, const_vec); > + } > + else > + { > + > + class expand_operand ops[2]; > + create_output_operand (&ops[0], reg, vector_mode); > + create_input_operand (&ops[1], data, QImode); > + expand_insn (icode, 2, ops); > + if (!rtx_equal_p (reg, ops[0].value)) > + emit_move_insn (reg, ops[0].value); > + } > + target = lowpart_subreg (mode, reg, vector_mode); > + } > + > + return target; > +} > + > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > + bytes from constant string DATA + OFFSET and return it as target > + constant. If PREV isn't nullptr, it has the RTL info from the > + previous iteration. */ > > +rtx > +builtin_memset_read_str (void *data, void *prev, > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > + scalar_int_mode mode) > +{ > + rtx target; > const char *c = (const char *) data; > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > + char *p; > + > + /* Don't use the previous value if size is 1. */ > + if (GET_MODE_SIZE (mode) != 1) > + { > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > + memset (p, *c, GET_MODE_SIZE (QImode)); > + rtx src = c_readstr (p, QImode); > + target = gen_memset_broadcast (src, mode); > + if (target != nullptr) > + return target; > + } > + > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > memset (p, *c, GET_MODE_SIZE (mode)); > > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp, > nullptr, it has the RTL info from the previous iteration. */ > > static rtx > -builtin_memset_gen_str (void *data, void *prevp, > +builtin_memset_gen_str (void *data, void *prev, > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > scalar_int_mode mode) > { > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp, > size_t size; > char *p; > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > - if (prev != nullptr && prev->data != nullptr) > - { > - /* Use the previous data in the same mode. */ > - if (prev->mode == mode) > - return prev->data; > - > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > - if (target != nullptr) > - return target; > - } > - > size = GET_MODE_SIZE (mode); > if (size == 1) > return (rtx) data; > > + target = gen_memset_value_from_prev (prev, mode); > + if (target != nullptr) > + return target; > + > + target = gen_memset_broadcast ((rtx) data, mode); > + if (target != nullptr) > + return target; > + > p = XALLOCAVEC (char, size); > memset (p, 1, size); > coeff = c_readstr (p, mode); > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > index 2a41ae5fba1..8849711fcf8 100644 > --- a/gcc/doc/tm.texi > +++ b/gcc/doc/tm.texi > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. > @end deftypefn > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) > +This hook should return an rtx for scratch register in @var{mode} to > +be used by memset broadcast. The default is @code{gen_reg_rtx}. > +@end deftypefn > + > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) > This target hook returns a new value for the number of times @var{loop} > should be unrolled. The parameter @var{nunroll} is the number of times > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > index f881cdabe9e..a6bbf4f2667 100644 > --- a/gcc/doc/tm.texi.in > +++ b/gcc/doc/tm.texi.in > @@ -7958,6 +7958,8 @@ lists. > > @hook TARGET_GEN_CCMP_NEXT > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > + > @hook TARGET_LOOP_UNROLL_ADJUST > > @defmac POWI_MAX_MULTS > diff --git a/gcc/target.def b/gcc/target.def > index c009671c583..5fb287db3bd 100644 > --- a/gcc/target.def > +++ b/gcc/target.def > @@ -2726,6 +2726,13 @@ DEFHOOK > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), > NULL) > > +DEFHOOK > +(gen_memset_scratch_rtx, > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > + rtx, (machine_mode mode), > + gen_reg_rtx) > + > /* Return a new value for loop unroll size. */ > DEFHOOK > (loop_unroll_adjust,
On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > > scratch register to avoid stack realignment when expanding memset. > > > > PR middle-end/90773 > > * builtins.c (gen_memset_value_from_prev): New function. > > (gen_memset_broadcast): Likewise. > > (builtin_memset_read_str): Use gen_memset_value_from_prev > > and gen_memset_broadcast. > > (builtin_memset_gen_str): Likewise. > > * target.def (gen_memset_scratch_rtx): New hook. > > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > > * doc/tm.texi: Regenerated. > > --- > > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > > gcc/doc/tm.texi | 5 ++ > > gcc/doc/tm.texi.in | 2 + > > gcc/target.def | 7 +++ > > 4 files changed, 116 insertions(+), 21 deletions(-) > > > > diff --git a/gcc/builtins.c b/gcc/builtins.c > > index 39ab139b7e1..c1758ae2efc 100644 > > --- a/gcc/builtins.c > > +++ b/gcc/builtins.c > > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > > return NULL_RTX; > > } > > > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > - bytes from constant string DATA + OFFSET and return it as target > > - constant. If PREV isn't nullptr, it has the RTL info from the > > +/* Return the RTL of a register in MODE generated from PREV in the > > previous iteration. */ > > > > -rtx > > -builtin_memset_read_str (void *data, void *prevp, > > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > - scalar_int_mode mode) > > +static rtx > > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > > { > > + rtx target = nullptr; > > by_pieces_prev *prev = (by_pieces_prev *) prevp; > > if (prev != nullptr && prev->data != nullptr) > > { > > /* Use the previous data in the same mode. */ > > if (prev->mode == mode) > > return prev->data; > > + > > + rtx prev_rtx = prev->data; > > + machine_mode prev_mode = prev->mode; > > + unsigned int word_size = GET_MODE_SIZE (word_mode); > > + if (word_size < GET_MODE_SIZE (prev->mode) > > + && word_size > GET_MODE_SIZE (mode)) > > + { > > + /* First generate subreg of word mode if the previous mode is > > + wider than word mode and word mode is wider than MODE. */ > > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > > + prev_mode, 0); > > + prev_mode = word_mode; > > + } > > + if (prev_rtx != nullptr) > > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > > } > > + return target; > > +} > > + > > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > > + > > +static rtx > > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > > +{ > > + /* Skip if regno_reg_rtx isn't initialized. */ > > + if (!regno_reg_rtx) > > + return nullptr; > > + > > + rtx target = nullptr; > > + > > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > > + machine_mode vector_mode; > > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > > + gcc_unreachable (); > > Sorry, I realise it's a bit late to be raising this objection now, > but I don't think it's a good idea to use scalar integer modes as > a proxy for vector modes. In principle there's no reason why a > target has to define an integer mode for every vector mode. A target always defines the largest integer mode. > If we want the mode to be a vector then I think the by-pieces > infrastructure should be extended to support vectors directly, > rather than assuming that each piece can be represented as > a scalar_int_mode. > The current by-pieces infrastructure operates on scalar_int_mode. Only for memset, there is /* Callback routine for store_by_pieces. Return the RTL of a register containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned char value given in the RTL register data. For example, if mode is 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't nullptr, it has the RTL info from the previous iteration. */ static rtx builtin_memset_gen_str (void *data, void *prevp, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, scalar_int_mode mode) It is a broadcast. If a target can broadcast a byte to a wider integer, can you suggest a way to use it in the current by-pieces infrastructure? Thanks. > Thanks, > Richard > > > + > > + enum insn_code icode = optab_handler (vec_duplicate_optab, > > + vector_mode); > > + if (icode != CODE_FOR_nothing) > > + { > > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); > > + if (CONST_INT_P (data)) > > + { > > + /* Use the move expander with CONST_VECTOR. */ > > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); > > + emit_move_insn (reg, const_vec); > > + } > > + else > > + { > > + > > + class expand_operand ops[2]; > > + create_output_operand (&ops[0], reg, vector_mode); > > + create_input_operand (&ops[1], data, QImode); > > + expand_insn (icode, 2, ops); > > + if (!rtx_equal_p (reg, ops[0].value)) > > + emit_move_insn (reg, ops[0].value); > > + } > > + target = lowpart_subreg (mode, reg, vector_mode); > > + } > > + > > + return target; > > +} > > + > > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > + bytes from constant string DATA + OFFSET and return it as target > > + constant. If PREV isn't nullptr, it has the RTL info from the > > + previous iteration. */ > > > > +rtx > > +builtin_memset_read_str (void *data, void *prev, > > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > + scalar_int_mode mode) > > +{ > > + rtx target; > > const char *c = (const char *) data; > > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > + char *p; > > + > > + /* Don't use the previous value if size is 1. */ > > + if (GET_MODE_SIZE (mode) != 1) > > + { > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); > > + memset (p, *c, GET_MODE_SIZE (QImode)); > > + rtx src = c_readstr (p, QImode); > > + target = gen_memset_broadcast (src, mode); > > + if (target != nullptr) > > + return target; > > + } > > + > > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); > > > > memset (p, *c, GET_MODE_SIZE (mode)); > > > > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp, > > nullptr, it has the RTL info from the previous iteration. */ > > > > static rtx > > -builtin_memset_gen_str (void *data, void *prevp, > > +builtin_memset_gen_str (void *data, void *prev, > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > scalar_int_mode mode) > > { > > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp, > > size_t size; > > char *p; > > > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > > - if (prev != nullptr && prev->data != nullptr) > > - { > > - /* Use the previous data in the same mode. */ > > - if (prev->mode == mode) > > - return prev->data; > > - > > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); > > - if (target != nullptr) > > - return target; > > - } > > - > > size = GET_MODE_SIZE (mode); > > if (size == 1) > > return (rtx) data; > > > > + target = gen_memset_value_from_prev (prev, mode); > > + if (target != nullptr) > > + return target; > > + > > + target = gen_memset_broadcast ((rtx) data, mode); > > + if (target != nullptr) > > + return target; > > + > > p = XALLOCAVEC (char, size); > > memset (p, 1, size); > > coeff = c_readstr (p, mode); > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi > > index 2a41ae5fba1..8849711fcf8 100644 > > --- a/gcc/doc/tm.texi > > +++ b/gcc/doc/tm.texi > > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence > > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. > > @end deftypefn > > > > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) > > +This hook should return an rtx for scratch register in @var{mode} to > > +be used by memset broadcast. The default is @code{gen_reg_rtx}. > > +@end deftypefn > > + > > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) > > This target hook returns a new value for the number of times @var{loop} > > should be unrolled. The parameter @var{nunroll} is the number of times > > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in > > index f881cdabe9e..a6bbf4f2667 100644 > > --- a/gcc/doc/tm.texi.in > > +++ b/gcc/doc/tm.texi.in > > @@ -7958,6 +7958,8 @@ lists. > > > > @hook TARGET_GEN_CCMP_NEXT > > > > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX > > + > > @hook TARGET_LOOP_UNROLL_ADJUST > > > > @defmac POWI_MAX_MULTS > > diff --git a/gcc/target.def b/gcc/target.def > > index c009671c583..5fb287db3bd 100644 > > --- a/gcc/target.def > > +++ b/gcc/target.def > > @@ -2726,6 +2726,13 @@ DEFHOOK > > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), > > NULL) > > > > +DEFHOOK > > +(gen_memset_scratch_rtx, > > + "This hook should return an rtx for scratch register in @var{mode} to\n\ > > +be used by memset broadcast. The default is @code{gen_reg_rtx}.", > > + rtx, (machine_mode mode), > > + gen_reg_rtx) > > + > > /* Return a new value for loop unroll size. */ > > DEFHOOK > > (loop_unroll_adjust,
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard >> > scratch register to avoid stack realignment when expanding memset. >> > >> > PR middle-end/90773 >> > * builtins.c (gen_memset_value_from_prev): New function. >> > (gen_memset_broadcast): Likewise. >> > (builtin_memset_read_str): Use gen_memset_value_from_prev >> > and gen_memset_broadcast. >> > (builtin_memset_gen_str): Likewise. >> > * target.def (gen_memset_scratch_rtx): New hook. >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. >> > * doc/tm.texi: Regenerated. >> > --- >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- >> > gcc/doc/tm.texi | 5 ++ >> > gcc/doc/tm.texi.in | 2 + >> > gcc/target.def | 7 +++ >> > 4 files changed, 116 insertions(+), 21 deletions(-) >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c >> > index 39ab139b7e1..c1758ae2efc 100644 >> > --- a/gcc/builtins.c >> > +++ b/gcc/builtins.c >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) >> > return NULL_RTX; >> > } >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >> > - bytes from constant string DATA + OFFSET and return it as target >> > - constant. If PREV isn't nullptr, it has the RTL info from the >> > +/* Return the RTL of a register in MODE generated from PREV in the >> > previous iteration. */ >> > >> > -rtx >> > -builtin_memset_read_str (void *data, void *prevp, >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > - scalar_int_mode mode) >> > +static rtx >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) >> > { >> > + rtx target = nullptr; >> > by_pieces_prev *prev = (by_pieces_prev *) prevp; >> > if (prev != nullptr && prev->data != nullptr) >> > { >> > /* Use the previous data in the same mode. */ >> > if (prev->mode == mode) >> > return prev->data; >> > + >> > + rtx prev_rtx = prev->data; >> > + machine_mode prev_mode = prev->mode; >> > + unsigned int word_size = GET_MODE_SIZE (word_mode); >> > + if (word_size < GET_MODE_SIZE (prev->mode) >> > + && word_size > GET_MODE_SIZE (mode)) >> > + { >> > + /* First generate subreg of word mode if the previous mode is >> > + wider than word mode and word mode is wider than MODE. */ >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, >> > + prev_mode, 0); >> > + prev_mode = word_mode; >> > + } >> > + if (prev_rtx != nullptr) >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); >> > } >> > + return target; >> > +} >> > + >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */ >> > + >> > +static rtx >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode) >> > +{ >> > + /* Skip if regno_reg_rtx isn't initialized. */ >> > + if (!regno_reg_rtx) >> > + return nullptr; >> > + >> > + rtx target = nullptr; >> > + >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); >> > + machine_mode vector_mode; >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) >> > + gcc_unreachable (); >> >> Sorry, I realise it's a bit late to be raising this objection now, >> but I don't think it's a good idea to use scalar integer modes as >> a proxy for vector modes. In principle there's no reason why a >> target has to define an integer mode for every vector mode. > > A target always defines the largest integer mode. Right. But a target shouldn't *need* to define an integer mode for every vector mode. >> If we want the mode to be a vector then I think the by-pieces >> infrastructure should be extended to support vectors directly, >> rather than assuming that each piece can be represented as >> a scalar_int_mode. >> > > The current by-pieces infrastructure operates on scalar_int_mode. > Only for memset, there is > > /* Callback routine for store_by_pieces. Return the RTL of a register > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned > char value given in the RTL register data. For example, if mode is > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't > nullptr, it has the RTL info from the previous iteration. */ > > static rtx > builtin_memset_gen_str (void *data, void *prevp, > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > scalar_int_mode mode) > > It is a broadcast. If a target can broadcast a byte to a wider integer, > can you suggest a way to use it in the current by-pieces infrastructure? I meant that we should extend the by-pieces infrastructure so that it no longer requires scalar_int_mode, rather than work within the current limits. IMO the fact that we're (legitimately) going through vec_duplicate_optab shows that we really are dealing with vectors here, not integers. Thanks, Richard
On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value. > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard > >> > scratch register to avoid stack realignment when expanding memset. > >> > > >> > PR middle-end/90773 > >> > * builtins.c (gen_memset_value_from_prev): New function. > >> > (gen_memset_broadcast): Likewise. > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev > >> > and gen_memset_broadcast. > >> > (builtin_memset_gen_str): Likewise. > >> > * target.def (gen_memset_scratch_rtx): New hook. > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX. > >> > * doc/tm.texi: Regenerated. > >> > --- > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++-------- > >> > gcc/doc/tm.texi | 5 ++ > >> > gcc/doc/tm.texi.in | 2 + > >> > gcc/target.def | 7 +++ > >> > 4 files changed, 116 insertions(+), 21 deletions(-) > >> > > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >> > index 39ab139b7e1..c1758ae2efc 100644 > >> > --- a/gcc/builtins.c > >> > +++ b/gcc/builtins.c > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) > >> > return NULL_RTX; > >> > } > >> > > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >> > - bytes from constant string DATA + OFFSET and return it as target > >> > - constant. If PREV isn't nullptr, it has the RTL info from the > >> > +/* Return the RTL of a register in MODE generated from PREV in the > >> > previous iteration. */ > >> > > >> > -rtx > >> > -builtin_memset_read_str (void *data, void *prevp, > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > - scalar_int_mode mode) > >> > +static rtx > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) > >> > { > >> > + rtx target = nullptr; > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp; > >> > if (prev != nullptr && prev->data != nullptr) > >> > { > >> > /* Use the previous data in the same mode. */ > >> > if (prev->mode == mode) > >> > return prev->data; > >> > + > >> > + rtx prev_rtx = prev->data; > >> > + machine_mode prev_mode = prev->mode; > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode); > >> > + if (word_size < GET_MODE_SIZE (prev->mode) > >> > + && word_size > GET_MODE_SIZE (mode)) > >> > + { > >> > + /* First generate subreg of word mode if the previous mode is > >> > + wider than word mode and word mode is wider than MODE. */ > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, > >> > + prev_mode, 0); > >> > + prev_mode = word_mode; > >> > + } > >> > + if (prev_rtx != nullptr) > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); > >> > } > >> > + return target; > >> > +} > >> > + > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */ > >> > + > >> > +static rtx > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode) > >> > +{ > >> > + /* Skip if regno_reg_rtx isn't initialized. */ > >> > + if (!regno_reg_rtx) > >> > + return nullptr; > >> > + > >> > + rtx target = nullptr; > >> > + > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); > >> > + machine_mode vector_mode; > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) > >> > + gcc_unreachable (); > >> > >> Sorry, I realise it's a bit late to be raising this objection now, > >> but I don't think it's a good idea to use scalar integer modes as > >> a proxy for vector modes. In principle there's no reason why a > >> target has to define an integer mode for every vector mode. > > > > A target always defines the largest integer mode. > > Right. But a target shouldn't *need* to define an integer mode > for every vector mode. > > >> If we want the mode to be a vector then I think the by-pieces > >> infrastructure should be extended to support vectors directly, > >> rather than assuming that each piece can be represented as > >> a scalar_int_mode. > >> > > > > The current by-pieces infrastructure operates on scalar_int_mode. > > Only for memset, there is > > > > /* Callback routine for store_by_pieces. Return the RTL of a register > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned > > char value given in the RTL register data. For example, if mode is > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't > > nullptr, it has the RTL info from the previous iteration. */ > > > > static rtx > > builtin_memset_gen_str (void *data, void *prevp, > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > scalar_int_mode mode) > > > > It is a broadcast. If a target can broadcast a byte to a wider integer, > > can you suggest a way to use it in the current by-pieces infrastructure? > > I meant that we should extend the by-pieces infrastructure so that > it no longer requires scalar_int_mode, rather than work within the > current limits. > > IMO the fact that we're (legitimately) going through vec_duplicate_optab If vec_duplicate_optab is the only blocking issue, can we add an integer broadcast optab? I want to avoid a major surgery just because we don't support integer broadcast from byte. > shows that we really are dealing with vectors here, not integers. > Are you suggesting we should add the optional vector mode support to by-pieces for memset?
diff --git a/gcc/builtins.c b/gcc/builtins.c index 39ab139b7e1..c1758ae2efc 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target) return NULL_RTX; } -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) - bytes from constant string DATA + OFFSET and return it as target - constant. If PREV isn't nullptr, it has the RTL info from the +/* Return the RTL of a register in MODE generated from PREV in the previous iteration. */ -rtx -builtin_memset_read_str (void *data, void *prevp, - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) +static rtx +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode) { + rtx target = nullptr; by_pieces_prev *prev = (by_pieces_prev *) prevp; if (prev != nullptr && prev->data != nullptr) { /* Use the previous data in the same mode. */ if (prev->mode == mode) return prev->data; + + rtx prev_rtx = prev->data; + machine_mode prev_mode = prev->mode; + unsigned int word_size = GET_MODE_SIZE (word_mode); + if (word_size < GET_MODE_SIZE (prev->mode) + && word_size > GET_MODE_SIZE (mode)) + { + /* First generate subreg of word mode if the previous mode is + wider than word mode and word mode is wider than MODE. */ + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx, + prev_mode, 0); + prev_mode = word_mode; + } + if (prev_rtx != nullptr) + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0); } + return target; +} + +/* Return the RTL of a register in MODE broadcasted from DATA. */ + +static rtx +gen_memset_broadcast (rtx data, scalar_int_mode mode) +{ + /* Skip if regno_reg_rtx isn't initialized. */ + if (!regno_reg_rtx) + return nullptr; + + rtx target = nullptr; + + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode); + machine_mode vector_mode; + if (!mode_for_vector (QImode, nunits).exists (&vector_mode)) + gcc_unreachable (); + + enum insn_code icode = optab_handler (vec_duplicate_optab, + vector_mode); + if (icode != CODE_FOR_nothing) + { + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode); + if (CONST_INT_P (data)) + { + /* Use the move expander with CONST_VECTOR. */ + rtx const_vec = gen_const_vec_duplicate (vector_mode, data); + emit_move_insn (reg, const_vec); + } + else + { + + class expand_operand ops[2]; + create_output_operand (&ops[0], reg, vector_mode); + create_input_operand (&ops[1], data, QImode); + expand_insn (icode, 2, ops); + if (!rtx_equal_p (reg, ops[0].value)) + emit_move_insn (reg, ops[0].value); + } + target = lowpart_subreg (mode, reg, vector_mode); + } + + return target; +} + +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) + bytes from constant string DATA + OFFSET and return it as target + constant. If PREV isn't nullptr, it has the RTL info from the + previous iteration. */ +rtx +builtin_memset_read_str (void *data, void *prev, + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, + scalar_int_mode mode) +{ + rtx target; const char *c = (const char *) data; - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); + char *p; + + /* Don't use the previous value if size is 1. */ + if (GET_MODE_SIZE (mode) != 1) + { + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode)); + memset (p, *c, GET_MODE_SIZE (QImode)); + rtx src = c_readstr (p, QImode); + target = gen_memset_broadcast (src, mode); + if (target != nullptr) + return target; + } + + p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); memset (p, *c, GET_MODE_SIZE (mode)); @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp, nullptr, it has the RTL info from the previous iteration. */ static rtx -builtin_memset_gen_str (void *data, void *prevp, +builtin_memset_gen_str (void *data, void *prev, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, scalar_int_mode mode) { @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp, size_t size; char *p; - by_pieces_prev *prev = (by_pieces_prev *) prevp; - if (prev != nullptr && prev->data != nullptr) - { - /* Use the previous data in the same mode. */ - if (prev->mode == mode) - return prev->data; - - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); - if (target != nullptr) - return target; - } - size = GET_MODE_SIZE (mode); if (size == 1) return (rtx) data; + target = gen_memset_value_from_prev (prev, mode); + if (target != nullptr) + return target; + + target = gen_memset_broadcast ((rtx) data, mode); + if (target != nullptr) + return target; + p = XALLOCAVEC (char, size); memset (p, 1, size); coeff = c_readstr (p, mode); diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 2a41ae5fba1..8849711fcf8 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) +This hook should return an rtx for scratch register in @var{mode} to +be used by memset broadcast. The default is @code{gen_reg_rtx}. +@end deftypefn + @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) This target hook returns a new value for the number of times @var{loop} should be unrolled. The parameter @var{nunroll} is the number of times diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index f881cdabe9e..a6bbf4f2667 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7958,6 +7958,8 @@ lists. @hook TARGET_GEN_CCMP_NEXT +@hook TARGET_GEN_MEMSET_SCRATCH_RTX + @hook TARGET_LOOP_UNROLL_ADJUST @defmac POWI_MAX_MULTS diff --git a/gcc/target.def b/gcc/target.def index c009671c583..5fb287db3bd 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2726,6 +2726,13 @@ DEFHOOK rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), NULL) +DEFHOOK +(gen_memset_scratch_rtx, + "This hook should return an rtx for scratch register in @var{mode} to\n\ +be used by memset broadcast. The default is @code{gen_reg_rtx}.", + rtx, (machine_mode mode), + gen_reg_rtx) + /* Return a new value for loop unroll size. */ DEFHOOK (loop_unroll_adjust,