Message ID | 20210720185058.244593-1-hjl.tools@gmail.com |
---|---|
State | New |
Headers | show |
Series | [v3] Add QI vector mode support to by-pieces for memset | expand |
"H.J. Lu" <hjl.tools@gmail.com> writes: > diff --git a/gcc/builtins.c b/gcc/builtins.c > index 39ab139b7e1..1972301ce3c 100644 > --- a/gcc/builtins.c > +++ b/gcc/builtins.c > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > > static rtx > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > - scalar_int_mode mode) > + fixed_size_mode mode) > { > /* The REPresentation pointed to by DATA need not be a nul-terminated > string but the caller guarantees it's large enough for MODE. */ > const char *rep = (const char *) data; > > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > + /* NB: Vector mode in the by-pieces infrastructure is only used by > + the memset expander. */ Sorry to nitpick, but I guess this might get out out-of-date. Maybe: /* The by-pieces infrastructure does not try to pick a vector mode for memcpy expansion. */ > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > + /*nul_terminated=*/false); > } > > /* LEN specify length of the block of memcpy/memset operation. > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > > rtx > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > - scalar_int_mode mode) > + fixed_size_mode mode) > { > const char *str = (const char *) data; > > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > return const0_rtx; > > - return c_readstr (str + offset, mode); > + /* NB: Vector mode in the by-pieces infrastructure is only used by > + the memset expander. */ Similarly here for strncpy expansion. > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > } > > /* Helper to check the sizes of sequences and the destination of calls > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > return NULL_RTX; > } > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > - bytes from constant string DATA + OFFSET and return it as target > - constant. If PREV isn't nullptr, it has the RTL info from the > +/* Return the RTL of a register in MODE generated from PREV in the > previous iteration. */ > > -rtx > -builtin_memset_read_str (void *data, void *prevp, > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > - scalar_int_mode mode) > +static rtx > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > { > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > + rtx target = nullptr; > if (prev != nullptr && prev->data != nullptr) > { > /* Use the previous data in the same mode. */ > if (prev->mode == mode) > return prev->data; > + > + fixed_size_mode prev_mode = prev->mode; > + > + /* Don't use the previous data to write QImode if it is in a > + vector mode. */ > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > + return target; > + > + rtx prev_rtx = prev->data; > + > + if (REG_P (prev_rtx) > + && HARD_REGISTER_P (prev_rtx) > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > + { > + /* If we can't put a hard register in MODE, first generate a > + subreg of word mode if the previous mode is wider than word > + mode and word mode is wider than MODE. */ > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > + { > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > + prev_mode); > + if (prev_rtx != nullptr) > + prev_mode = word_mode; > + } > + else > + prev_rtx = nullptr; I don't understand this. Why not just do the: if (REG_P (prev_rtx) && HARD_REGISTER_P (prev_rtx) && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) prev_rtx = copy_to_reg (prev_rtx); that I suggested in the previous review? IMO anything that relies on a sequence of two subreg operations is doing something wrong. > + } > + if (prev_rtx != nullptr) > + target = lowpart_subreg (mode, prev_rtx, prev_mode); > } > + return target; > +} > + > […] > @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) > return align; > } > > -/* Return the widest integer mode that is narrower than SIZE bytes. */ > +/* Return the widest QI vector, if QI_MODE is true, or integer mode > + that is narrower than SIZE bytes. */ > > -static scalar_int_mode > -widest_int_mode_for_size (unsigned int size) > +static fixed_size_mode > +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector) > { > - scalar_int_mode result = NARROWEST_INT_MODE; > + machine_mode result = NARROWEST_INT_MODE; > > gcc_checking_assert (size > 1); > > + /* Use QI vector only if size is wider than a WORD. */ > + if (qi_vector && size > UNITS_PER_WORD) > + { > + machine_mode mode; > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > + if (GET_MODE_INNER (mode) == QImode > + && GET_MODE_SIZE (mode).is_constant ()) > + { > + if (GET_MODE_SIZE (mode).to_constant () >= size) > + break; > + if (optab_handler (vec_duplicate_optab, mode) > + != CODE_FOR_nothing) > + result = mode; > + } > + > + if (result != NARROWEST_INT_MODE) > + return as_a <fixed_size_mode> (result); > + } > + > opt_scalar_int_mode tmode; > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) > if (GET_MODE_SIZE (tmode.require ()) < size) > result = tmode.require (); > > - return result; > + return as_a <fixed_size_mode> (result); > } A less cast-heavy way to write this would be: fixed_size_mode result = NARROWEST_INT_MODE; gcc_checking_assert (size > 1); /* Use QI vector only if size is wider than a WORD. */ if (qi_vector && size > UNITS_PER_WORD) { machine_mode mode; fixed_size_mode candidate; FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) if (is_a<fixed_size_mode> (mode, &candidate) && GET_MODE_INNER (candidate) == QImode) { if (GET_MODE_SIZE (candidate) >= size) break; if (optab_handler (vec_duplicate_optab, candidate) != CODE_FOR_nothing) result = candidate; } if (result != NARROWEST_INT_MODE) return result; } opt_scalar_int_mode tmode; FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) if (GET_MODE_SIZE (tmode.require ()) < size) result = tmode.require (); return result; > @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) > size = GET_MODE_SIZE (mode); > if (len >= size && prepare_mode (mode, m_align)) > break; > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > - mode = widest_int_mode_for_size (size); > + /* NB: widest_fixed_size_mode_for_size checks SIZE > 1. */ > + mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode); > } > while (1); > return mode; > } > > +/* Return the smallest integer or QI vector mode that is not narrower > + than SIZE bytes. */ > + > +fixed_size_mode > +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size) > +{ > + /* Use QI vector only for > size of WORD. */ > + if (m_qi_vector_mode && size > UNITS_PER_WORD) > + { > + machine_mode mode; > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > + if (GET_MODE_INNER (mode) == QImode > + && GET_MODE_SIZE (mode).is_constant ()) > + { > + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); > + > + /* NB: Don't return a mode wider than M_LEN. */ > + if (mode_size > m_len) > + break; > + > + if (mode_size >= size > + && (optab_handler (vec_duplicate_optab, mode) > + != CODE_FOR_nothing)) > + return as_a <fixed_size_mode> (mode); > + } > + } Same idea here. I realise this is pre-existing, but the heavy use of “NB:” seems a bit odd. I don't think it's something that the patch needs to adopt. Otherwise it looks good, thanks. I think the main sticking point is the subreg thing. Richard
On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu" <hjl.tools@gmail.com> writes: > > diff --git a/gcc/builtins.c b/gcc/builtins.c > > index 39ab139b7e1..1972301ce3c 100644 > > --- a/gcc/builtins.c > > +++ b/gcc/builtins.c > > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > > > > static rtx > > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > > - scalar_int_mode mode) > > + fixed_size_mode mode) > > { > > /* The REPresentation pointed to by DATA need not be a nul-terminated > > string but the caller guarantees it's large enough for MODE. */ > > const char *rep = (const char *) data; > > > > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > > + /* NB: Vector mode in the by-pieces infrastructure is only used by > > + the memset expander. */ > > Sorry to nitpick, but I guess this might get out out-of-date. Maybe: > > /* The by-pieces infrastructure does not try to pick a vector mode > for memcpy expansion. */ Fixed. > > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > > + /*nul_terminated=*/false); > > } > > > > /* LEN specify length of the block of memcpy/memset operation. > > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > > > > rtx > > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > > - scalar_int_mode mode) > > + fixed_size_mode mode) > > { > > const char *str = (const char *) data; > > > > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > > return const0_rtx; > > > > - return c_readstr (str + offset, mode); > > + /* NB: Vector mode in the by-pieces infrastructure is only used by > > + the memset expander. */ > > Similarly here for strncpy expansion. Fixed. > > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > > } > > > > /* Helper to check the sizes of sequences and the destination of calls > > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > > return NULL_RTX; > > } > > > > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > > - bytes from constant string DATA + OFFSET and return it as target > > - constant. If PREV isn't nullptr, it has the RTL info from the > > +/* Return the RTL of a register in MODE generated from PREV in the > > previous iteration. */ > > > > -rtx > > -builtin_memset_read_str (void *data, void *prevp, > > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > > - scalar_int_mode mode) > > +static rtx > > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > > { > > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > > + rtx target = nullptr; > > if (prev != nullptr && prev->data != nullptr) > > { > > /* Use the previous data in the same mode. */ > > if (prev->mode == mode) > > return prev->data; > > + > > + fixed_size_mode prev_mode = prev->mode; > > + > > + /* Don't use the previous data to write QImode if it is in a > > + vector mode. */ > > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > > + return target; > > + > > + rtx prev_rtx = prev->data; > > + > > + if (REG_P (prev_rtx) > > + && HARD_REGISTER_P (prev_rtx) > > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > > + { > > + /* If we can't put a hard register in MODE, first generate a > > + subreg of word mode if the previous mode is wider than word > > + mode and word mode is wider than MODE. */ > > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > > + { > > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > > + prev_mode); > > + if (prev_rtx != nullptr) > > + prev_mode = word_mode; > > + } > > + else > > + prev_rtx = nullptr; > > I don't understand this. Why not just do the: > > if (REG_P (prev_rtx) > && HARD_REGISTER_P (prev_rtx) > && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > prev_rtx = copy_to_reg (prev_rtx); > > that I suggested in the previous review? But for --- extern void *ops; void foo (int c) { __builtin_memset (ops, c, 18); } --- I got vpbroadcastb %edi, %xmm31 vmovdqa64 %xmm31, -24(%rsp) movq ops(%rip), %rax movzwl -24(%rsp), %edx vmovdqu8 %xmm31, (%rax) movw %dx, 16(%rax) ret I want to avoid store and load. I am testing if (REG_P (prev_rtx) && HARD_REGISTER_P (prev_rtx) && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) { /* Find the smallest subreg mode in the same mode class which is not narrower than MODE and narrower than PREV_MODE. */ machine_mode m; fixed_size_mode candidate; FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) if (is_a<fixed_size_mode> (m, &candidate)) { if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (prev_mode)) break; if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, candidate) >= 0) { target = lowpart_subreg (candidate, prev_rtx, prev_mode); prev_rtx = target; prev_mode = candidate; break; } } if (target == nullptr) prev_rtx = copy_to_reg (prev_rtx); } to get movq ops(%rip), %rax vpbroadcastb %edi, %xmm31 vmovd %xmm31, %edx movw %dx, 16(%rax) vmovdqu8 %xmm31, (%rax) ret > IMO anything that relies on a sequence of two subreg operations is > doing something wrong. > > > + } > > + if (prev_rtx != nullptr) > > + target = lowpart_subreg (mode, prev_rtx, prev_mode); > > } > > + return target; > > +} > > + > > […] > > @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) > > return align; > > } > > > > -/* Return the widest integer mode that is narrower than SIZE bytes. */ > > +/* Return the widest QI vector, if QI_MODE is true, or integer mode > > + that is narrower than SIZE bytes. */ > > > > -static scalar_int_mode > > -widest_int_mode_for_size (unsigned int size) > > +static fixed_size_mode > > +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector) > > { > > - scalar_int_mode result = NARROWEST_INT_MODE; > > + machine_mode result = NARROWEST_INT_MODE; > > > > gcc_checking_assert (size > 1); > > > > + /* Use QI vector only if size is wider than a WORD. */ > > + if (qi_vector && size > UNITS_PER_WORD) > > + { > > + machine_mode mode; > > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > > + if (GET_MODE_INNER (mode) == QImode > > + && GET_MODE_SIZE (mode).is_constant ()) > > + { > > + if (GET_MODE_SIZE (mode).to_constant () >= size) > > + break; > > + if (optab_handler (vec_duplicate_optab, mode) > > + != CODE_FOR_nothing) > > + result = mode; > > + } > > + > > + if (result != NARROWEST_INT_MODE) > > + return as_a <fixed_size_mode> (result); > > + } > > + > > opt_scalar_int_mode tmode; > > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) > > if (GET_MODE_SIZE (tmode.require ()) < size) > > result = tmode.require (); > > > > - return result; > > + return as_a <fixed_size_mode> (result); > > } > > A less cast-heavy way to write this would be: > > fixed_size_mode result = NARROWEST_INT_MODE; > > gcc_checking_assert (size > 1); > > /* Use QI vector only if size is wider than a WORD. */ > if (qi_vector && size > UNITS_PER_WORD) > { > machine_mode mode; > fixed_size_mode candidate; > FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > if (is_a<fixed_size_mode> (mode, &candidate) > && GET_MODE_INNER (candidate) == QImode) > { > if (GET_MODE_SIZE (candidate) >= size) > break; > if (optab_handler (vec_duplicate_optab, candidate) > != CODE_FOR_nothing) > result = candidate; > } > > if (result != NARROWEST_INT_MODE) > return result; > } Fixed. > opt_scalar_int_mode tmode; > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) > if (GET_MODE_SIZE (tmode.require ()) < size) > result = tmode.require (); > > return result; > > > @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) > > size = GET_MODE_SIZE (mode); > > if (len >= size && prepare_mode (mode, m_align)) > > break; > > - /* NB: widest_int_mode_for_size checks SIZE > 1. */ > > - mode = widest_int_mode_for_size (size); > > + /* NB: widest_fixed_size_mode_for_size checks SIZE > 1. */ > > + mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode); > > } > > while (1); > > return mode; > > } > > > > +/* Return the smallest integer or QI vector mode that is not narrower > > + than SIZE bytes. */ > > + > > +fixed_size_mode > > +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size) > > +{ > > + /* Use QI vector only for > size of WORD. */ > > + if (m_qi_vector_mode && size > UNITS_PER_WORD) > > + { > > + machine_mode mode; > > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) > > + if (GET_MODE_INNER (mode) == QImode > > + && GET_MODE_SIZE (mode).is_constant ()) > > + { > > + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); > > + > > + /* NB: Don't return a mode wider than M_LEN. */ > > + if (mode_size > m_len) > > + break; > > + > > + if (mode_size >= size > > + && (optab_handler (vec_duplicate_optab, mode) > > + != CODE_FOR_nothing)) > > + return as_a <fixed_size_mode> (mode); > > + } > > + } > > Same idea here. Fixed. > I realise this is pre-existing, but the heavy use of “NB:” seems a bit odd. > I don't think it's something that the patch needs to adopt. Noted. > Otherwise it looks good, thanks. I think the main sticking point is > the subreg thing. > > Richard > > Thanks.
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > <richard.sandiford@arm.com> wrote: >> >> "H.J. Lu" <hjl.tools@gmail.com> writes: >> > diff --git a/gcc/builtins.c b/gcc/builtins.c >> > index 39ab139b7e1..1972301ce3c 100644 >> > --- a/gcc/builtins.c >> > +++ b/gcc/builtins.c >> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) >> > >> > static rtx >> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, >> > - scalar_int_mode mode) >> > + fixed_size_mode mode) >> > { >> > /* The REPresentation pointed to by DATA need not be a nul-terminated >> > string but the caller guarantees it's large enough for MODE. */ >> > const char *rep = (const char *) data; >> > >> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >> > + the memset expander. */ >> >> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: >> >> /* The by-pieces infrastructure does not try to pick a vector mode >> for memcpy expansion. */ > > Fixed. > >> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), >> > + /*nul_terminated=*/false); >> > } >> > >> > /* LEN specify length of the block of memcpy/memset operation. >> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) >> > >> > rtx >> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, >> > - scalar_int_mode mode) >> > + fixed_size_mode mode) >> > { >> > const char *str = (const char *) data; >> > >> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) >> > return const0_rtx; >> > >> > - return c_readstr (str + offset, mode); >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >> > + the memset expander. */ >> >> Similarly here for strncpy expansion. > > Fixed. > >> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); >> > } >> > >> > /* Helper to check the sizes of sequences and the destination of calls >> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) >> > return NULL_RTX; >> > } >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >> > - bytes from constant string DATA + OFFSET and return it as target >> > - constant. If PREV isn't nullptr, it has the RTL info from the >> > +/* Return the RTL of a register in MODE generated from PREV in the >> > previous iteration. */ >> > >> > -rtx >> > -builtin_memset_read_str (void *data, void *prevp, >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >> > - scalar_int_mode mode) >> > +static rtx >> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) >> > { >> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; >> > + rtx target = nullptr; >> > if (prev != nullptr && prev->data != nullptr) >> > { >> > /* Use the previous data in the same mode. */ >> > if (prev->mode == mode) >> > return prev->data; >> > + >> > + fixed_size_mode prev_mode = prev->mode; >> > + >> > + /* Don't use the previous data to write QImode if it is in a >> > + vector mode. */ >> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) >> > + return target; >> > + >> > + rtx prev_rtx = prev->data; >> > + >> > + if (REG_P (prev_rtx) >> > + && HARD_REGISTER_P (prev_rtx) >> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >> > + { >> > + /* If we can't put a hard register in MODE, first generate a >> > + subreg of word mode if the previous mode is wider than word >> > + mode and word mode is wider than MODE. */ >> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) >> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) >> > + { >> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, >> > + prev_mode); >> > + if (prev_rtx != nullptr) >> > + prev_mode = word_mode; >> > + } >> > + else >> > + prev_rtx = nullptr; >> >> I don't understand this. Why not just do the: >> >> if (REG_P (prev_rtx) >> && HARD_REGISTER_P (prev_rtx) >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >> prev_rtx = copy_to_reg (prev_rtx); >> >> that I suggested in the previous review? > > But for > --- > extern void *ops; > > void > foo (int c) > { > __builtin_memset (ops, c, 18); > } > --- > I got > > vpbroadcastb %edi, %xmm31 > vmovdqa64 %xmm31, -24(%rsp) > movq ops(%rip), %rax > movzwl -24(%rsp), %edx > vmovdqu8 %xmm31, (%rax) > movw %dx, 16(%rax) > ret > > I want to avoid store and load. I am testing > > if (REG_P (prev_rtx) > && HARD_REGISTER_P (prev_rtx) > && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > { > /* Find the smallest subreg mode in the same mode class which > is not narrower than MODE and narrower than PREV_MODE. */ > machine_mode m; > fixed_size_mode candidate; > FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > if (is_a<fixed_size_mode> (m, &candidate)) > { > if (GET_MODE_SIZE (candidate) > >= GET_MODE_SIZE (prev_mode)) > break; > if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > && lowpart_subreg_regno (REGNO (prev_rtx), > prev_mode, candidate) >= 0) > { > target = lowpart_subreg (candidate, prev_rtx, > prev_mode); > prev_rtx = target; > prev_mode = candidate; > break; > } > } That doesn't seem better though. There are still two subregs involved. > if (target == nullptr) > prev_rtx = copy_to_reg (prev_rtx); > } > > to get > > movq ops(%rip), %rax > vpbroadcastb %edi, %xmm31 > vmovd %xmm31, %edx > movw %dx, 16(%rax) > vmovdqu8 %xmm31, (%rax) > ret What does the pre-RA RTL look like for the code that you want? Thanks, Richard
On Wed, Jul 21, 2021 at 12:20 PM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > > <richard.sandiford@arm.com> wrote: > >> > >> "H.J. Lu" <hjl.tools@gmail.com> writes: > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >> > index 39ab139b7e1..1972301ce3c 100644 > >> > --- a/gcc/builtins.c > >> > +++ b/gcc/builtins.c > >> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > >> > > >> > static rtx > >> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >> > - scalar_int_mode mode) > >> > + fixed_size_mode mode) > >> > { > >> > /* The REPresentation pointed to by DATA need not be a nul-terminated > >> > string but the caller guarantees it's large enough for MODE. */ > >> > const char *rep = (const char *) data; > >> > > >> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >> > + the memset expander. */ > >> > >> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: > >> > >> /* The by-pieces infrastructure does not try to pick a vector mode > >> for memcpy expansion. */ > > > > Fixed. > > > >> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > >> > + /*nul_terminated=*/false); > >> > } > >> > > >> > /* LEN specify length of the block of memcpy/memset operation. > >> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > >> > > >> > rtx > >> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >> > - scalar_int_mode mode) > >> > + fixed_size_mode mode) > >> > { > >> > const char *str = (const char *) data; > >> > > >> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > >> > return const0_rtx; > >> > > >> > - return c_readstr (str + offset, mode); > >> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >> > + the memset expander. */ > >> > >> Similarly here for strncpy expansion. > > > > Fixed. > > > >> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > >> > } > >> > > >> > /* Helper to check the sizes of sequences and the destination of calls > >> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > >> > return NULL_RTX; > >> > } > >> > > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >> > - bytes from constant string DATA + OFFSET and return it as target > >> > - constant. If PREV isn't nullptr, it has the RTL info from the > >> > +/* Return the RTL of a register in MODE generated from PREV in the > >> > previous iteration. */ > >> > > >> > -rtx > >> > -builtin_memset_read_str (void *data, void *prevp, > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >> > - scalar_int_mode mode) > >> > +static rtx > >> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > >> > { > >> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > >> > + rtx target = nullptr; > >> > if (prev != nullptr && prev->data != nullptr) > >> > { > >> > /* Use the previous data in the same mode. */ > >> > if (prev->mode == mode) > >> > return prev->data; > >> > + > >> > + fixed_size_mode prev_mode = prev->mode; > >> > + > >> > + /* Don't use the previous data to write QImode if it is in a > >> > + vector mode. */ > >> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > >> > + return target; > >> > + > >> > + rtx prev_rtx = prev->data; > >> > + > >> > + if (REG_P (prev_rtx) > >> > + && HARD_REGISTER_P (prev_rtx) > >> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> > + { > >> > + /* If we can't put a hard register in MODE, first generate a > >> > + subreg of word mode if the previous mode is wider than word > >> > + mode and word mode is wider than MODE. */ > >> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > >> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > >> > + { > >> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > >> > + prev_mode); > >> > + if (prev_rtx != nullptr) > >> > + prev_mode = word_mode; > >> > + } > >> > + else > >> > + prev_rtx = nullptr; > >> > >> I don't understand this. Why not just do the: > >> > >> if (REG_P (prev_rtx) > >> && HARD_REGISTER_P (prev_rtx) > >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> prev_rtx = copy_to_reg (prev_rtx); > >> > >> that I suggested in the previous review? > > > > But for > > --- > > extern void *ops; > > > > void > > foo (int c) > > { > > __builtin_memset (ops, c, 18); > > } > > --- > > I got > > > > vpbroadcastb %edi, %xmm31 > > vmovdqa64 %xmm31, -24(%rsp) > > movq ops(%rip), %rax > > movzwl -24(%rsp), %edx > > vmovdqu8 %xmm31, (%rax) > > movw %dx, 16(%rax) > > ret > > > > I want to avoid store and load. I am testing > > > > if (REG_P (prev_rtx) > > && HARD_REGISTER_P (prev_rtx) > > && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > > { > > /* Find the smallest subreg mode in the same mode class which > > is not narrower than MODE and narrower than PREV_MODE. */ > > machine_mode m; > > fixed_size_mode candidate; > > FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > > if (is_a<fixed_size_mode> (m, &candidate)) > > { > > if (GET_MODE_SIZE (candidate) > > >= GET_MODE_SIZE (prev_mode)) > > break; > > if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > > && lowpart_subreg_regno (REGNO (prev_rtx), > > prev_mode, candidate) >= 0) > > { > > target = lowpart_subreg (candidate, prev_rtx, > > prev_mode); > > prev_rtx = target; > > prev_mode = candidate; > > break; > > } > > } > > That doesn't seem better though. There are still two subregs involved. Since a hard register is used, there is only one subreg generated: (insn 7 6 8 2 (set (reg:QI 85) (subreg:QI (reg/v:SI 83 [ c ]) 0)) "s2a.i":6:3 77 {*movqi_internal} (nil)) (insn 8 7 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (reg:QI 85))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_g prv16qi} (nil)) (insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84) [0 MEM <char[1:18]> [(void *)ops.0_ 1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal} (nil)) > > if (target == nullptr) > > prev_rtx = copy_to_reg (prev_rtx); > > } > > > > to get > > > > movq ops(%rip), %rax > > vpbroadcastb %edi, %xmm31 > > vmovd %xmm31, %edx > > movw %dx, 16(%rax) > > vmovdqu8 %xmm31, (%rax) > > ret > > What does the pre-RA RTL look like for the code that you want? WIth my code, I got (insn 8 6 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (subreg:QI (reg:SI 86) 0))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_gprv16qi} (expr_list:REG_DEAD (reg:SI 86) (nil))) (insn 9 8 10 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM <char[1:18]> [(void *)ops.0_1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ]) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:SI 67 xmm31) 0)) "s2a.i":6:3 76 {*movhi_internal} (expr_list:REG_DEAD (reg/f:DI 84 [ ops ]) (expr_list:REG_DEAD (reg:SI 67 xmm31) (nil)))) With copy_to_reg, I got (insn 8 6 9 2 (set (reg:V16QI 67 xmm31) (vec_duplicate:V16QI (subreg:QI (reg:SI 87) 0))) "s2a.i":6:3 5165 {*avx512vl_vec_dup_gprv16qi} (expr_list:REG_DEAD (reg:SI 87) (nil))) (insn 9 8 15 2 (set (mem:V16QI (reg/f:DI 84 [ ops ]) [0 MEM <char[1:18]> [(void *)ops.0_1]+0 S16 A8]) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (nil)) (insn 15 9 10 2 (set (reg:V16QI 88) (reg:V16QI 67 xmm31)) "s2a.i":6:3 1574 {movv16qi_internal} (expr_list:REG_DEAD (reg:V16QI 67 xmm31) (nil))) (note 10 15 11 2 NOTE_INSN_DELETED) (insn 11 10 0 2 (set (mem:HI (plus:DI (reg/f:DI 84 [ ops ]) (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void *)ops.0_1]+16 S2 A8]) (subreg:HI (reg:V16QI 88) 0)) "s2a.i":6:3 76 {*movhi_internal} (expr_list:REG_DEAD (reg:V16QI 88) (expr_list:REG_DEAD (reg/f:DI 84 [ ops ]) (nil)))) > Thanks, > Richard
Richard Sandiford <richard.sandiford@arm.com> writes: > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: >> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford >> <richard.sandiford@arm.com> wrote: >>> >>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>> > diff --git a/gcc/builtins.c b/gcc/builtins.c >>> > index 39ab139b7e1..1972301ce3c 100644 >>> > --- a/gcc/builtins.c >>> > +++ b/gcc/builtins.c >>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) >>> > >>> > static rtx >>> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, >>> > - scalar_int_mode mode) >>> > + fixed_size_mode mode) >>> > { >>> > /* The REPresentation pointed to by DATA need not be a nul-terminated >>> > string but the caller guarantees it's large enough for MODE. */ >>> > const char *rep = (const char *) data; >>> > >>> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >>> > + the memset expander. */ >>> >>> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: >>> >>> /* The by-pieces infrastructure does not try to pick a vector mode >>> for memcpy expansion. */ >> >> Fixed. >> >>> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), >>> > + /*nul_terminated=*/false); >>> > } >>> > >>> > /* LEN specify length of the block of memcpy/memset operation. >>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) >>> > >>> > rtx >>> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, >>> > - scalar_int_mode mode) >>> > + fixed_size_mode mode) >>> > { >>> > const char *str = (const char *) data; >>> > >>> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) >>> > return const0_rtx; >>> > >>> > - return c_readstr (str + offset, mode); >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by >>> > + the memset expander. */ >>> >>> Similarly here for strncpy expansion. >> >> Fixed. >> >>> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); >>> > } >>> > >>> > /* Helper to check the sizes of sequences and the destination of calls >>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) >>> > return NULL_RTX; >>> > } >>> > >>> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) >>> > - bytes from constant string DATA + OFFSET and return it as target >>> > - constant. If PREV isn't nullptr, it has the RTL info from the >>> > +/* Return the RTL of a register in MODE generated from PREV in the >>> > previous iteration. */ >>> > >>> > -rtx >>> > -builtin_memset_read_str (void *data, void *prevp, >>> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, >>> > - scalar_int_mode mode) >>> > +static rtx >>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) >>> > { >>> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; >>> > + rtx target = nullptr; >>> > if (prev != nullptr && prev->data != nullptr) >>> > { >>> > /* Use the previous data in the same mode. */ >>> > if (prev->mode == mode) >>> > return prev->data; >>> > + >>> > + fixed_size_mode prev_mode = prev->mode; >>> > + >>> > + /* Don't use the previous data to write QImode if it is in a >>> > + vector mode. */ >>> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) >>> > + return target; >>> > + >>> > + rtx prev_rtx = prev->data; >>> > + >>> > + if (REG_P (prev_rtx) >>> > + && HARD_REGISTER_P (prev_rtx) >>> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >>> > + { >>> > + /* If we can't put a hard register in MODE, first generate a >>> > + subreg of word mode if the previous mode is wider than word >>> > + mode and word mode is wider than MODE. */ >>> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) >>> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) >>> > + { >>> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, >>> > + prev_mode); >>> > + if (prev_rtx != nullptr) >>> > + prev_mode = word_mode; >>> > + } >>> > + else >>> > + prev_rtx = nullptr; >>> >>> I don't understand this. Why not just do the: >>> >>> if (REG_P (prev_rtx) >>> && HARD_REGISTER_P (prev_rtx) >>> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >>> prev_rtx = copy_to_reg (prev_rtx); >>> >>> that I suggested in the previous review? >> >> But for >> --- >> extern void *ops; >> >> void >> foo (int c) >> { >> __builtin_memset (ops, c, 18); >> } >> --- >> I got >> >> vpbroadcastb %edi, %xmm31 >> vmovdqa64 %xmm31, -24(%rsp) >> movq ops(%rip), %rax >> movzwl -24(%rsp), %edx >> vmovdqu8 %xmm31, (%rax) >> movw %dx, 16(%rax) >> ret >> >> I want to avoid store and load. I am testing >> >> if (REG_P (prev_rtx) >> && HARD_REGISTER_P (prev_rtx) >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) >> { >> /* Find the smallest subreg mode in the same mode class which >> is not narrower than MODE and narrower than PREV_MODE. */ >> machine_mode m; >> fixed_size_mode candidate; >> FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) >> if (is_a<fixed_size_mode> (m, &candidate)) >> { >> if (GET_MODE_SIZE (candidate) >> >= GET_MODE_SIZE (prev_mode)) >> break; >> if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) >> && lowpart_subreg_regno (REGNO (prev_rtx), >> prev_mode, candidate) >= 0) >> { >> target = lowpart_subreg (candidate, prev_rtx, >> prev_mode); >> prev_rtx = target; >> prev_mode = candidate; >> break; >> } >> } > > That doesn't seem better though. There are still two subregs involved. Actually, I take that back. I guess it does make sense, and is definitely better than hard-coding word_mode. How about a comment along the lines of the following, instead of the one above: /* This case occurs when PREV_MODE is a vector and when MODE is too small to store using vector operations. After register allocation, the code will need to move the lowpart of the vector register into a non-vector register. Also, the target has chosen to use a hard register instead of going with the default choice of using a pseudo register. We should respect that choice and try to avoid creating a pseudo register with the same mode as the current hard register. In principle, we could just use a lowpart MODE subreg of the vector register. However, the vector register mode might be too wide for non-vector registers, and we already know that the non-vector mode is too small for vector registers. It's therefore likely that we'd need to spill to memory in the vector mode and reload the non-vector value from there. Try to avoid that by reducing the vector register to the smallest size that it can hold. This should increase the chances that non-vector registers can hold both the inner and outer modes of the subreg that we generate later. */ Thanks, Richard
On Wed, Jul 21, 2021 at 12:42 PM Richard Sandiford <richard.sandiford@arm.com> wrote: > > Richard Sandiford <richard.sandiford@arm.com> writes: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > >> <richard.sandiford@arm.com> wrote: > >>> > >>> "H.J. Lu" <hjl.tools@gmail.com> writes: > >>> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >>> > index 39ab139b7e1..1972301ce3c 100644 > >>> > --- a/gcc/builtins.c > >>> > +++ b/gcc/builtins.c > >>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) > >>> > > >>> > static rtx > >>> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >>> > - scalar_int_mode mode) > >>> > + fixed_size_mode mode) > >>> > { > >>> > /* The REPresentation pointed to by DATA need not be a nul-terminated > >>> > string but the caller guarantees it's large enough for MODE. */ > >>> > const char *rep = (const char *) data; > >>> > > >>> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >>> > + the memset expander. */ > >>> > >>> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: > >>> > >>> /* The by-pieces infrastructure does not try to pick a vector mode > >>> for memcpy expansion. */ > >> > >> Fixed. > >> > >>> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > >>> > + /*nul_terminated=*/false); > >>> > } > >>> > > >>> > /* LEN specify length of the block of memcpy/memset operation. > >>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > >>> > > >>> > rtx > >>> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >>> > - scalar_int_mode mode) > >>> > + fixed_size_mode mode) > >>> > { > >>> > const char *str = (const char *) data; > >>> > > >>> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > >>> > return const0_rtx; > >>> > > >>> > - return c_readstr (str + offset, mode); > >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >>> > + the memset expander. */ > >>> > >>> Similarly here for strncpy expansion. > >> > >> Fixed. > >> > >>> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > >>> > } > >>> > > >>> > /* Helper to check the sizes of sequences and the destination of calls > >>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > >>> > return NULL_RTX; > >>> > } > >>> > > >>> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >>> > - bytes from constant string DATA + OFFSET and return it as target > >>> > - constant. If PREV isn't nullptr, it has the RTL info from the > >>> > +/* Return the RTL of a register in MODE generated from PREV in the > >>> > previous iteration. */ > >>> > > >>> > -rtx > >>> > -builtin_memset_read_str (void *data, void *prevp, > >>> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >>> > - scalar_int_mode mode) > >>> > +static rtx > >>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > >>> > { > >>> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > >>> > + rtx target = nullptr; > >>> > if (prev != nullptr && prev->data != nullptr) > >>> > { > >>> > /* Use the previous data in the same mode. */ > >>> > if (prev->mode == mode) > >>> > return prev->data; > >>> > + > >>> > + fixed_size_mode prev_mode = prev->mode; > >>> > + > >>> > + /* Don't use the previous data to write QImode if it is in a > >>> > + vector mode. */ > >>> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > >>> > + return target; > >>> > + > >>> > + rtx prev_rtx = prev->data; > >>> > + > >>> > + if (REG_P (prev_rtx) > >>> > + && HARD_REGISTER_P (prev_rtx) > >>> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >>> > + { > >>> > + /* If we can't put a hard register in MODE, first generate a > >>> > + subreg of word mode if the previous mode is wider than word > >>> > + mode and word mode is wider than MODE. */ > >>> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > >>> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > >>> > + { > >>> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > >>> > + prev_mode); > >>> > + if (prev_rtx != nullptr) > >>> > + prev_mode = word_mode; > >>> > + } > >>> > + else > >>> > + prev_rtx = nullptr; > >>> > >>> I don't understand this. Why not just do the: > >>> > >>> if (REG_P (prev_rtx) > >>> && HARD_REGISTER_P (prev_rtx) > >>> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >>> prev_rtx = copy_to_reg (prev_rtx); > >>> > >>> that I suggested in the previous review? > >> > >> But for > >> --- > >> extern void *ops; > >> > >> void > >> foo (int c) > >> { > >> __builtin_memset (ops, c, 18); > >> } > >> --- > >> I got > >> > >> vpbroadcastb %edi, %xmm31 > >> vmovdqa64 %xmm31, -24(%rsp) > >> movq ops(%rip), %rax > >> movzwl -24(%rsp), %edx > >> vmovdqu8 %xmm31, (%rax) > >> movw %dx, 16(%rax) > >> ret > >> > >> I want to avoid store and load. I am testing > >> > >> if (REG_P (prev_rtx) > >> && HARD_REGISTER_P (prev_rtx) > >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> { > >> /* Find the smallest subreg mode in the same mode class which > >> is not narrower than MODE and narrower than PREV_MODE. */ > >> machine_mode m; > >> fixed_size_mode candidate; > >> FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > >> if (is_a<fixed_size_mode> (m, &candidate)) > >> { > >> if (GET_MODE_SIZE (candidate) > >> >= GET_MODE_SIZE (prev_mode)) > >> break; > >> if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > >> && lowpart_subreg_regno (REGNO (prev_rtx), > >> prev_mode, candidate) >= 0) > >> { > >> target = lowpart_subreg (candidate, prev_rtx, > >> prev_mode); > >> prev_rtx = target; > >> prev_mode = candidate; > >> break; > >> } > >> } > > > > That doesn't seem better though. There are still two subregs involved. > > Actually, I take that back. I guess it does make sense, and is > definitely better than hard-coding word_mode. How about a comment > along the lines of the following, instead of the one above: > > /* This case occurs when PREV_MODE is a vector and when > MODE is too small to store using vector operations. > After register allocation, the code will need to move the > lowpart of the vector register into a non-vector register. > > Also, the target has chosen to use a hard register > instead of going with the default choice of using a > pseudo register. We should respect that choice and try to > avoid creating a pseudo register with the same mode as the > current hard register. > > In principle, we could just use a lowpart MODE subreg of > the vector register. However, the vector register mode might > be too wide for non-vector registers, and we already know > that the non-vector mode is too small for vector registers. > It's therefore likely that we'd need to spill to memory in > the vector mode and reload the non-vector value from there. > > Try to avoid that by reducing the vector register to the > smallest size that it can hold. This should increase the > chances that non-vector registers can hold both the inner > and outer modes of the subreg that we generate later. */ Fixed. > Thanks, > Richard I sent the v4 patch. Thanks.
diff --git a/gcc/builtins.c b/gcc/builtins.c index 39ab139b7e1..1972301ce3c 100644 --- a/gcc/builtins.c +++ b/gcc/builtins.c @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode) static rtx builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + fixed_size_mode mode) { /* The REPresentation pointed to by DATA need not be a nul-terminated string but the caller guarantees it's large enough for MODE. */ const char *rep = (const char *) data; - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); + /* NB: Vector mode in the by-pieces infrastructure is only used by + the memset expander. */ + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), + /*nul_terminated=*/false); } /* LEN specify length of the block of memcpy/memset operation. @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) rtx builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + fixed_size_mode mode) { const char *str = (const char *) data; if ((unsigned HOST_WIDE_INT) offset > strlen (str)) return const0_rtx; - return c_readstr (str + offset, mode); + /* NB: Vector mode in the by-pieces infrastructure is only used by + the memset expander. */ + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); } /* Helper to check the sizes of sequences and the destination of calls @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) return NULL_RTX; } -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) - bytes from constant string DATA + OFFSET and return it as target - constant. If PREV isn't nullptr, it has the RTL info from the +/* Return the RTL of a register in MODE generated from PREV in the previous iteration. */ -rtx -builtin_memset_read_str (void *data, void *prevp, - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) +static rtx +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) { - by_pieces_prev *prev = (by_pieces_prev *) prevp; + rtx target = nullptr; if (prev != nullptr && prev->data != nullptr) { /* Use the previous data in the same mode. */ if (prev->mode == mode) return prev->data; + + fixed_size_mode prev_mode = prev->mode; + + /* Don't use the previous data to write QImode if it is in a + vector mode. */ + if (VECTOR_MODE_P (prev_mode) && mode == QImode) + return target; + + rtx prev_rtx = prev->data; + + if (REG_P (prev_rtx) + && HARD_REGISTER_P (prev_rtx) + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) + { + /* If we can't put a hard register in MODE, first generate a + subreg of word mode if the previous mode is wider than word + mode and word mode is wider than MODE. */ + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) + { + prev_rtx = lowpart_subreg (word_mode, prev_rtx, + prev_mode); + if (prev_rtx != nullptr) + prev_mode = word_mode; + } + else + prev_rtx = nullptr; + } + if (prev_rtx != nullptr) + target = lowpart_subreg (mode, prev_rtx, prev_mode); } + return target; +} + +/* Return the RTL of a register in MODE broadcasted from DATA. */ + +static rtx +gen_memset_broadcast (rtx data, fixed_size_mode mode) +{ + /* Skip if MODE is not a vector mode. */ + if (!VECTOR_MODE_P (mode)) + return nullptr; + + gcc_assert (GET_MODE_INNER (mode) == QImode); + + enum insn_code icode = optab_handler (vec_duplicate_optab, mode); + /* NB: vec_duplicate_optab is a precondition to pick a vector mode + for the memset expander. */ + gcc_assert (icode != CODE_FOR_nothing); + + rtx target = targetm.gen_memset_scratch_rtx (mode); + if (CONST_INT_P (data)) + { + /* Use the move expander with CONST_VECTOR. */ + rtx const_vec = gen_const_vec_duplicate (mode, data); + emit_move_insn (target, const_vec); + } + else + { + class expand_operand ops[2]; + create_output_operand (&ops[0], target, mode); + create_input_operand (&ops[1], data, QImode); + expand_insn (icode, 2, ops); + if (!rtx_equal_p (target, ops[0].value)) + emit_move_insn (target, ops[0].value); + } + + return target; +} + +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) + bytes from constant string DATA + OFFSET and return it as target + constant. If PREV isn't nullptr, it has the RTL info from the + previous iteration. */ + +rtx +builtin_memset_read_str (void *data, void *prev, + HOST_WIDE_INT offset ATTRIBUTE_UNUSED, + fixed_size_mode mode) +{ const char *c = (const char *) data; - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode)); + unsigned int size = GET_MODE_SIZE (mode); + + rtx target = gen_memset_value_from_prev ((by_pieces_prev *) prev, + mode); + if (target != nullptr) + return target; + rtx src = gen_int_mode (*c, QImode); + target = gen_memset_broadcast (src, mode); + if (target != nullptr) + return target; + + char *p = XALLOCAVEC (char, size); - memset (p, *c, GET_MODE_SIZE (mode)); + memset (p, *c, size); - return c_readstr (p, mode); + /* NB: Vector mode should be handled by gen_memset_broadcast above. */ + return c_readstr (p, as_a <scalar_int_mode> (mode)); } /* Callback routine for store_by_pieces. Return the RTL of a register @@ -6719,33 +6811,29 @@ builtin_memset_read_str (void *data, void *prevp, nullptr, it has the RTL info from the previous iteration. */ static rtx -builtin_memset_gen_str (void *data, void *prevp, +builtin_memset_gen_str (void *data, void *prev, HOST_WIDE_INT offset ATTRIBUTE_UNUSED, - scalar_int_mode mode) + fixed_size_mode mode) { rtx target, coeff; size_t size; char *p; - by_pieces_prev *prev = (by_pieces_prev *) prevp; - if (prev != nullptr && prev->data != nullptr) - { - /* Use the previous data in the same mode. */ - if (prev->mode == mode) - return prev->data; - - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0); - if (target != nullptr) - return target; - } - size = GET_MODE_SIZE (mode); if (size == 1) return (rtx) data; + target = gen_memset_value_from_prev ((by_pieces_prev *) prev, mode); + if (target != nullptr) + return target; + + target = gen_memset_broadcast ((rtx) data, mode); + if (target != nullptr) + return target; + p = XALLOCAVEC (char, size); memset (p, 1, size); - coeff = c_readstr (p, mode); + coeff = c_readstr (p, as_a <scalar_int_mode> (mode)); target = convert_to_mode (mode, (rtx) data, 1); target = expand_mult (mode, target, coeff, NULL_RTX, 1); @@ -6849,7 +6937,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len, &valc, align, true)) return false; - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode); + by_pieces_constfn constfun; void *constfundata; if (val) { diff --git a/gcc/builtins.h b/gcc/builtins.h index a64ece3f1cd..4b2ad766c61 100644 --- a/gcc/builtins.h +++ b/gcc/builtins.h @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn); extern tree mathfn_built_in (tree, combined_fn); extern tree mathfn_built_in_type (combined_fn); extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + fixed_size_mode); extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + fixed_size_mode); extern rtx expand_builtin_saveregs (void); extern tree std_build_builtin_va_list (void); extern tree std_fn_abi_va_list (tree); diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index c8f4abe3e41..14d387750a8 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -12149,6 +12149,13 @@ This function prepares to emit a conditional comparison within a sequence @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode}) +This hook should return an rtx for scratch register in @var{mode} to +be used by memset broadcast. The backend can use a hard scratch register +to avoid stack realignment when expanding memset. The default is +@code{gen_reg_rtx}. +@end deftypefn + @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop}) This target hook returns a new value for the number of times @var{loop} should be unrolled. The parameter @var{nunroll} is the number of times diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 9c4b5016053..923b1009d09 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -7984,6 +7984,8 @@ lists. @hook TARGET_GEN_CCMP_NEXT +@hook TARGET_GEN_MEMSET_SCRATCH_RTX + @hook TARGET_LOOP_UNROLL_ADJUST @defmac POWI_MAX_MULTS diff --git a/gcc/expr.c b/gcc/expr.c index 6a4368113c4..c8e27ed77af 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align) return align; } -/* Return the widest integer mode that is narrower than SIZE bytes. */ +/* Return the widest QI vector, if QI_MODE is true, or integer mode + that is narrower than SIZE bytes. */ -static scalar_int_mode -widest_int_mode_for_size (unsigned int size) +static fixed_size_mode +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector) { - scalar_int_mode result = NARROWEST_INT_MODE; + machine_mode result = NARROWEST_INT_MODE; gcc_checking_assert (size > 1); + /* Use QI vector only if size is wider than a WORD. */ + if (qi_vector && size > UNITS_PER_WORD) + { + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (GET_MODE_INNER (mode) == QImode + && GET_MODE_SIZE (mode).is_constant ()) + { + if (GET_MODE_SIZE (mode).to_constant () >= size) + break; + if (optab_handler (vec_duplicate_optab, mode) + != CODE_FOR_nothing) + result = mode; + } + + if (result != NARROWEST_INT_MODE) + return as_a <fixed_size_mode> (result); + } + opt_scalar_int_mode tmode; FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT) if (GET_MODE_SIZE (tmode.require ()) < size) result = tmode.require (); - return result; + return as_a <fixed_size_mode> (result); } /* Determine whether an operation OP on LEN bytes with alignment ALIGN can @@ -815,13 +835,14 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, unsigned int max_size, by_pieces_operation op) { unsigned HOST_WIDE_INT n_insns = 0; - scalar_int_mode mode; + fixed_size_mode mode; if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES) { /* NB: Round up L and ALIGN to the widest integer mode for MAX_SIZE. */ - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); if (optab_handler (mov_optab, mode) != CODE_FOR_nothing) { unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode)); @@ -835,7 +856,8 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align, while (max_size > 1 && l > 0) { - mode = widest_int_mode_for_size (max_size); + mode = widest_fixed_size_mode_for_size (max_size, + op == SET_BY_PIECES); enum insn_code icode; unsigned int modesize = GET_MODE_SIZE (mode); @@ -903,8 +925,7 @@ class pieces_addr void *m_cfndata; public: pieces_addr (rtx, bool, by_pieces_constfn, void *); - rtx adjust (scalar_int_mode, HOST_WIDE_INT, - by_pieces_prev * = nullptr); + rtx adjust (fixed_size_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr); void increment_address (HOST_WIDE_INT); void maybe_predec (HOST_WIDE_INT); void maybe_postinc (HOST_WIDE_INT); @@ -1006,7 +1027,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse, but we still modify the MEM's properties. */ rtx -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset, +pieces_addr::adjust (fixed_size_mode mode, HOST_WIDE_INT offset, by_pieces_prev *prev) { if (m_constfn) @@ -1060,11 +1081,14 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size) class op_by_pieces_d { private: - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int); + fixed_size_mode get_usable_mode (fixed_size_mode, unsigned int); + fixed_size_mode smallest_fixed_size_mode_for_size (unsigned int); protected: pieces_addr m_to, m_from; - unsigned HOST_WIDE_INT m_len; + /* NB: Make m_len read-only so that smallest_fixed_size_mode_for_size + can use it to check the valid mode size. */ + const unsigned HOST_WIDE_INT m_len; HOST_WIDE_INT m_offset; unsigned int m_align; unsigned int m_max_size; @@ -1073,6 +1097,8 @@ class op_by_pieces_d bool m_push; /* True if targetm.overlap_op_by_pieces_p () returns true. */ bool m_overlap_op_by_pieces; + /* True if QI vector mode can be used. */ + bool m_qi_vector_mode; /* Virtual functions, overriden by derived classes for the specific operation. */ @@ -1084,7 +1110,8 @@ class op_by_pieces_d public: op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool); + unsigned HOST_WIDE_INT, unsigned int, bool, + bool = false); void run (); }; @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, - unsigned int align, bool push) + unsigned int align, bool push, + bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), m_len (len), m_max_size (MOVE_MAX_PIECES + 1), - m_push (push) + m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); int fromi = m_from.get_addr_inc (); @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2) { /* Find the mode of the largest comparison. */ - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); + fixed_size_mode mode + = widest_fixed_size_mode_for_size (m_max_size, + m_qi_vector_mode); m_from.decide_autoinc (mode, m_reverse, len); m_to.decide_autoinc (mode, m_reverse, len); @@ -1139,8 +1169,8 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, /* This function returns the largest usable integer mode for LEN bytes whose size is no bigger than size of MODE. */ -scalar_int_mode -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) +fixed_size_mode +op_by_pieces_d::get_usable_mode (fixed_size_mode mode, unsigned int len) { unsigned int size; do @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len) size = GET_MODE_SIZE (mode); if (len >= size && prepare_mode (mode, m_align)) break; - /* NB: widest_int_mode_for_size checks SIZE > 1. */ - mode = widest_int_mode_for_size (size); + /* NB: widest_fixed_size_mode_for_size checks SIZE > 1. */ + mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode); } while (1); return mode; } +/* Return the smallest integer or QI vector mode that is not narrower + than SIZE bytes. */ + +fixed_size_mode +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size) +{ + /* Use QI vector only for > size of WORD. */ + if (m_qi_vector_mode && size > UNITS_PER_WORD) + { + machine_mode mode; + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT) + if (GET_MODE_INNER (mode) == QImode + && GET_MODE_SIZE (mode).is_constant ()) + { + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); + + /* NB: Don't return a mode wider than M_LEN. */ + if (mode_size > m_len) + break; + + if (mode_size >= size + && (optab_handler (vec_duplicate_optab, mode) + != CODE_FOR_nothing)) + return as_a <fixed_size_mode> (mode); + } + } + + return smallest_int_mode_for_size (size * BITS_PER_UNIT); +} + /* This function contains the main loop used for expanding a block operation. First move what we can in the largest integer mode, then go to successively smaller modes. For every access, call @@ -1166,9 +1226,12 @@ op_by_pieces_d::run () if (m_len == 0) return; - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */ - scalar_int_mode mode = widest_int_mode_for_size (m_max_size); - mode = get_usable_mode (mode, m_len); + unsigned HOST_WIDE_INT length = m_len; + + /* NB: widest_fixed_size_mode_for_size checks M_MAX_SIZE > 1. */ + fixed_size_mode mode + = widest_fixed_size_mode_for_size (m_max_size, m_qi_vector_mode); + mode = get_usable_mode (mode, length); by_pieces_prev to_prev = { nullptr, mode }; by_pieces_prev from_prev = { nullptr, mode }; @@ -1178,7 +1241,7 @@ op_by_pieces_d::run () unsigned int size = GET_MODE_SIZE (mode); rtx to1 = NULL_RTX, from1; - while (m_len >= size) + while (length >= size) { if (m_reverse) m_offset -= size; @@ -1201,22 +1264,22 @@ op_by_pieces_d::run () if (!m_reverse) m_offset += size; - m_len -= size; + length -= size; } finish_mode (mode); - if (m_len == 0) + if (length == 0) return; if (!m_push && m_overlap_op_by_pieces) { /* NB: Generate overlapping operations if it is not a stack push since stack push must not overlap. Get the smallest - integer mode for M_LEN bytes. */ - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT); + fixed size mode for M_LEN bytes. */ + mode = smallest_fixed_size_mode_for_size (length); mode = get_usable_mode (mode, GET_MODE_SIZE (mode)); - int gap = GET_MODE_SIZE (mode) - m_len; + int gap = GET_MODE_SIZE (mode) - length; if (gap > 0) { /* If size of MODE > M_LEN, generate the last operation @@ -1226,20 +1289,21 @@ op_by_pieces_d::run () m_offset += gap; else m_offset -= gap; - m_len += gap; + length += gap; } } else { - /* NB: widest_int_mode_for_size checks SIZE > 1. */ - mode = widest_int_mode_for_size (size); - mode = get_usable_mode (mode, m_len); + /* NB: widest_fixed_size_mode_for_size checks SIZE > 1. */ + mode = widest_fixed_size_mode_for_size (size, + m_qi_vector_mode); + mode = get_usable_mode (mode, length); } } while (1); /* The code above should have handled everything. */ - gcc_assert (!m_len); + gcc_assert (!length); } /* Derived class from op_by_pieces_d, providing support for block move @@ -1355,9 +1419,10 @@ class store_by_pieces_d : public op_by_pieces_d public: store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, - unsigned HOST_WIDE_INT len, unsigned int align) + unsigned HOST_WIDE_INT len, unsigned int align, + bool qi_vector_mode) : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false) + align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1446,7 +1511,10 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len, max_size = STORE_MAX_PIECES + 1; while (max_size > 1 && l > 0) { - scalar_int_mode mode = widest_int_mode_for_size (max_size); + /* NB: Since this can be called before virtual registers are + ready to use, avoid QI vector mode here. */ + fixed_size_mode mode + = widest_fixed_size_mode_for_size (max_size, false); icode = optab_handler (mov_optab, mode); if (icode != CODE_FOR_nothing @@ -1504,7 +1572,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, memsetp ? SET_BY_PIECES : STORE_BY_PIECES, optimize_insn_for_speed_p ())); - store_by_pieces_d data (to, constfun, constfundata, len, align); + store_by_pieces_d data (to, constfun, constfundata, len, align, + memsetp); data.run (); if (retmode != RETURN_BEGIN) @@ -1513,15 +1582,6 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len, return to; } -/* Callback routine for clear_by_pieces. - Return const0_rtx unconditionally. */ - -static rtx -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode) -{ - return const0_rtx; -} - /* Generate several move instructions to clear LEN bytes of block TO. (A MEM rtx with BLKmode). ALIGN is maximum alignment we can assume. */ @@ -1531,7 +1591,10 @@ clear_by_pieces (rtx to, unsigned HOST_WIDE_INT len, unsigned int align) if (len == 0) return; - store_by_pieces_d data (to, clear_by_pieces_1, NULL, len, align); + /* NB: Use builtin_memset_read_str to support vector mode broadcast. */ + char c = 0; + store_by_pieces_d data (to, builtin_memset_read_str, &c, len, align, + true); data.run (); } @@ -5754,7 +5817,7 @@ emit_storent_insn (rtx to, rtx from) static rtx string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, - scalar_int_mode mode) + fixed_size_mode mode) { tree str = (tree) data; @@ -5769,10 +5832,13 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset, size_t l = TREE_STRING_LENGTH (str) - offset; memcpy (p, TREE_STRING_POINTER (str) + offset, l); memset (p + l, '\0', GET_MODE_SIZE (mode) - l); - return c_readstr (p, mode, false); + return c_readstr (p, as_a <scalar_int_mode> (mode), false); } - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false); + /* NB: Vector mode in the by-pieces infrastructure is only used by + the memset expander. */ + return c_readstr (TREE_STRING_POINTER (str) + offset, + as_a <scalar_int_mode> (mode), false); } /* Generate code for computing expression EXP, diff --git a/gcc/expr.h b/gcc/expr.h index a4f44265759..94a85b40dca 100644 --- a/gcc/expr.h +++ b/gcc/expr.h @@ -108,13 +108,13 @@ enum block_op_methods }; typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT, - scalar_int_mode); + fixed_size_mode); /* The second pointer passed to by_pieces_constfn. */ struct by_pieces_prev { rtx data; - scalar_int_mode mode; + fixed_size_mode mode; }; extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods); diff --git a/gcc/rtl.h b/gcc/rtl.h index 2dbc4339469..e0766ffb4ad 100644 --- a/gcc/rtl.h +++ b/gcc/rtl.h @@ -2460,6 +2460,8 @@ extern bool subreg_offset_representable_p (unsigned int, machine_mode, extern unsigned int subreg_regno (const_rtx); extern int simplify_subreg_regno (unsigned int, machine_mode, poly_uint64, machine_mode); +extern int lowpart_subreg_regno (unsigned int, machine_mode, + machine_mode); extern unsigned int subreg_nregs (const_rtx); extern unsigned int subreg_nregs_with_regno (unsigned int, const_rtx); extern unsigned HOST_WIDE_INT nonzero_bits (const_rtx, machine_mode); diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c index ec7a062829c..3b8d88afd4d 100644 --- a/gcc/rtlanal.c +++ b/gcc/rtlanal.c @@ -4325,6 +4325,17 @@ simplify_subreg_regno (unsigned int xregno, machine_mode xmode, return (int) yregno; } +/* A wrapper around simplify_subreg_regno that uses subreg_lowpart_offset + (xmode, ymode) as the offset. */ + +int +lowpart_subreg_regno (unsigned int regno, machine_mode xmode, + machine_mode ymode) +{ + poly_uint64 offset = subreg_lowpart_offset (xmode, ymode); + return simplify_subreg_regno (regno, xmode, offset, ymode); +} + /* Return the final regno that a subreg expression refers to. */ unsigned int subreg_regno (const_rtx x) diff --git a/gcc/target.def b/gcc/target.def index 2e40448e6c5..b801289cebd 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -2726,6 +2726,15 @@ DEFHOOK rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code), NULL) +DEFHOOK +(gen_memset_scratch_rtx, + "This hook should return an rtx for scratch register in @var{mode} to\n\ +be used by memset broadcast. The backend can use a hard scratch register\n\ +to avoid stack realignment when expanding memset. The default is\n\ +@code{gen_reg_rtx}.", + rtx, (machine_mode mode), + gen_reg_rtx) + /* Return a new value for loop unroll size. */ DEFHOOK (loop_unroll_adjust, diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c index b6dbcf7809b..007e79f91b0 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c @@ -10,6 +10,6 @@ foo (void) } /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c index f41e6147b4c..1e50dc842bc 100644 --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c @@ -4,6 +4,6 @@ #include "pr100865-4a.c" /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */ -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */ +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */ /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */ /* { dg-final { scan-assembler-not "vmovdqa" } } */