diff mbox series

[v5,2/2] x86: Add vec_duplicate<mode> expander

Message ID 20210626200223.1371358-3-hjl.tools@gmail.com
State New
Headers show
Series x86: Convert CONST_WIDE_INT/CONST_VECTOR to broadcast | expand

Commit Message

H.J. Lu June 26, 2021, 8:02 p.m. UTC
1. Update vec_duplicate to allow to fail so that backend can only allow
broadcasting an integer constant to a vector when broadcast instruction
is available.  This can be used by memset expander to avoid vec_duplicate
when loading from constant pool is more efficient.
2. Add vec_duplicate<mode> expander and enable vec_duplicate from a
non-standard SSE constant integer only if vector broadcast is available.

	* config/i386/i386-expand.c (ix86_expand_integer_vec_duplicate):
	New function.
	* config/i386/i386-protos.h (ix86_expand_integer_vec_duplicat):
	New prototype.
	* config/i386/sse.md (INT_BROADCAST_MODE): New mode iterator.
	(vec_duplicate<mode>): New expander.
	* doc/md.texi: Update vec_duplicate.
---
 gcc/config/i386/i386-expand.c | 24 ++++++++++++++++++++++++
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/sse.md        | 28 ++++++++++++++++++++++++++++
 gcc/doc/md.texi               |  2 --
 4 files changed, 53 insertions(+), 2 deletions(-)

Comments

Richard Sandiford June 27, 2021, 8:43 a.m. UTC | #1
"H.J. Lu" <hjl.tools@gmail.com> writes:
> 1. Update vec_duplicate to allow to fail so that backend can only allow
> broadcasting an integer constant to a vector when broadcast instruction
> is available.  This can be used by memset expander to avoid vec_duplicate
> when loading from constant pool is more efficient.

I don't see any changes in target-independent code though, other than
the doc update.  It's still the case that (existing) uses of
vec_duplicate_optab do not allow it to fail.

Thanks,
Richard

> 2. Add vec_duplicate<mode> expander and enable vec_duplicate from a
> non-standard SSE constant integer only if vector broadcast is available.
>
> 	* config/i386/i386-expand.c (ix86_expand_integer_vec_duplicate):
> 	New function.
> 	* config/i386/i386-protos.h (ix86_expand_integer_vec_duplicat):
> 	New prototype.
> 	* config/i386/sse.md (INT_BROADCAST_MODE): New mode iterator.
> 	(vec_duplicate<mode>): New expander.
> 	* doc/md.texi: Update vec_duplicate.
> ---
>  gcc/config/i386/i386-expand.c | 24 ++++++++++++++++++++++++
>  gcc/config/i386/i386-protos.h |  1 +
>  gcc/config/i386/sse.md        | 28 ++++++++++++++++++++++++++++
>  gcc/doc/md.texi               |  2 --
>  4 files changed, 53 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index e9e89c82764..75c160d4349 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -15742,6 +15742,30 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
>      }
>  }
>  
> +/* Expand integer vec_duplicate.  Return true if successful.  */
> +
> +bool
> +ix86_expand_integer_vec_duplicate (rtx *operands)
> +{
> +  /* Enable VEC_DUPLICATE from a non-standard SSE constant integer only
> +     if vector broadcast is available.  */
> +  machine_mode mode = GET_MODE (operands[0]);
> +  if (CONST_INT_P (operands[1])
> +      && (!(TARGET_AVX2
> +	    || (TARGET_AVX
> +		&& (GET_MODE_INNER (mode) == SImode
> +		    || GET_MODE_INNER (mode) == DImode)))
> +	  || standard_sse_constant_p (operands[1], mode)))
> +    return false;
> +
> +  bool ok = ix86_expand_vector_init_duplicate (false, mode,
> +					       operands[0],
> +					       operands[1]);
> +  gcc_assert (ok);
> +
> +  return true;
> +}
> +
>  /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
>     to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
>     The upper bits of DEST are undefined, though they shouldn't cause
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 71745b9a1ea..a6cc09bb75b 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -258,6 +258,7 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
>  extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_abs (rtx, rtx);
> +extern bool ix86_expand_integer_vec_duplicate (rtx *);
>  
>  /* In i386-c.c  */
>  extern void ix86_target_macros (void);
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index e4f01e64bc1..53a703fb466 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -24640,3 +24640,31 @@ (define_insn "*aes<aeswideklvariant>u8"
>    "TARGET_WIDEKL"
>    "aes<aeswideklvariant>\t{%0}"
>    [(set_attr "type" "other")])
> +
> +;; Modes handled by broadcast patterns.  NB: Allow V64QI and V32HI with
> +;; TARGET_AVX512F since ix86_expand_integer_vec_duplicate can expand
> +;; without TARGET_AVX512BW which is used by memset vector broadcast
> +;; expander to XI with:
> +;; 	vmovd		%edi, %xmm15
> +;;	vpbroadcastb	%xmm15, %ymm15
> +;;	vinserti64x4	$0x1, %ymm15, %zmm15, %zmm15
> +
> +(define_mode_iterator INT_BROADCAST_MODE
> +  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
> +   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
> +   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
> +   (V8DI "TARGET_AVX512F && TARGET_64BIT")
> +   (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")])
> +
> +;; Broadcast from an integer.  NB: Enable broadcast only if we can move
> +;; from GPR to SSE register directly.
> +(define_expand "vec_duplicate<mode>"
> +  [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
> +	(vec_duplicate:INT_BROADCAST_MODE
> +	  (match_operand:<ssescalarmode> 1 "general_operand")))]
> +  "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
> +{
> +  if (!ix86_expand_integer_vec_duplicate (operands))
> +    FAIL;
> +  DONE;
> +})
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 1b918144330..a892c94d163 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5077,8 +5077,6 @@ the mode appropriate for one element of @var{m}.
>  This pattern only handles duplicates of non-constant inputs.  Constant
>  vectors go through the @code{mov@var{m}} pattern instead.
>  
> -This pattern is not allowed to @code{FAIL}.
> -
>  @cindex @code{vec_series@var{m}} instruction pattern
>  @item @samp{vec_series@var{m}}
>  Initialize vector output operand 0 so that element @var{i} is equal to
H.J. Lu June 27, 2021, 11:29 a.m. UTC | #2
On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu" <hjl.tools@gmail.com> writes:
> > 1. Update vec_duplicate to allow to fail so that backend can only allow
> > broadcasting an integer constant to a vector when broadcast instruction
> > is available.  This can be used by memset expander to avoid vec_duplicate
> > when loading from constant pool is more efficient.
>
> I don't see any changes in target-independent code though, other than
> the doc update.  It's still the case that (existing) uses of
> vec_duplicate_optab do not allow it to fail.

I have a followup patch set on

https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast

to use it to expand memset with vector broadcast:

https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3

For SSE2 which doesn't have vector broadcast, the constant vector broadcast
expander returns FAIL and load from constant pool will be used.

> Thanks,
> Richard
>
> > 2. Add vec_duplicate<mode> expander and enable vec_duplicate from a
> > non-standard SSE constant integer only if vector broadcast is available.
> >
> >       * config/i386/i386-expand.c (ix86_expand_integer_vec_duplicate):
> >       New function.
> >       * config/i386/i386-protos.h (ix86_expand_integer_vec_duplicat):
> >       New prototype.
> >       * config/i386/sse.md (INT_BROADCAST_MODE): New mode iterator.
> >       (vec_duplicate<mode>): New expander.
> >       * doc/md.texi: Update vec_duplicate.
> > ---
> >  gcc/config/i386/i386-expand.c | 24 ++++++++++++++++++++++++
> >  gcc/config/i386/i386-protos.h |  1 +
> >  gcc/config/i386/sse.md        | 28 ++++++++++++++++++++++++++++
> >  gcc/doc/md.texi               |  2 --
> >  4 files changed, 53 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> > index e9e89c82764..75c160d4349 100644
> > --- a/gcc/config/i386/i386-expand.c
> > +++ b/gcc/config/i386/i386-expand.c
> > @@ -15742,6 +15742,30 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
> >      }
> >  }
> >
> > +/* Expand integer vec_duplicate.  Return true if successful.  */
> > +
> > +bool
> > +ix86_expand_integer_vec_duplicate (rtx *operands)
> > +{
> > +  /* Enable VEC_DUPLICATE from a non-standard SSE constant integer only
> > +     if vector broadcast is available.  */
> > +  machine_mode mode = GET_MODE (operands[0]);
> > +  if (CONST_INT_P (operands[1])
> > +      && (!(TARGET_AVX2
> > +         || (TARGET_AVX
> > +             && (GET_MODE_INNER (mode) == SImode
> > +                 || GET_MODE_INNER (mode) == DImode)))
> > +       || standard_sse_constant_p (operands[1], mode)))
> > +    return false;
> > +
> > +  bool ok = ix86_expand_vector_init_duplicate (false, mode,
> > +                                            operands[0],
> > +                                            operands[1]);
> > +  gcc_assert (ok);
> > +
> > +  return true;
> > +}
> > +
> >  /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
> >     to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
> >     The upper bits of DEST are undefined, though they shouldn't cause
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index 71745b9a1ea..a6cc09bb75b 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -258,6 +258,7 @@ extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
> >  extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
> >  extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
> >  extern void ix86_expand_sse2_abs (rtx, rtx);
> > +extern bool ix86_expand_integer_vec_duplicate (rtx *);
> >
> >  /* In i386-c.c  */
> >  extern void ix86_target_macros (void);
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index e4f01e64bc1..53a703fb466 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -24640,3 +24640,31 @@ (define_insn "*aes<aeswideklvariant>u8"
> >    "TARGET_WIDEKL"
> >    "aes<aeswideklvariant>\t{%0}"
> >    [(set_attr "type" "other")])
> > +
> > +;; Modes handled by broadcast patterns.  NB: Allow V64QI and V32HI with
> > +;; TARGET_AVX512F since ix86_expand_integer_vec_duplicate can expand
> > +;; without TARGET_AVX512BW which is used by memset vector broadcast
> > +;; expander to XI with:
> > +;;   vmovd           %edi, %xmm15
> > +;;   vpbroadcastb    %xmm15, %ymm15
> > +;;   vinserti64x4    $0x1, %ymm15, %zmm15, %zmm15
> > +
> > +(define_mode_iterator INT_BROADCAST_MODE
> > +  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
> > +   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
> > +   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
> > +   (V8DI "TARGET_AVX512F && TARGET_64BIT")
> > +   (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")])
> > +
> > +;; Broadcast from an integer.  NB: Enable broadcast only if we can move
> > +;; from GPR to SSE register directly.
> > +(define_expand "vec_duplicate<mode>"
> > +  [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
> > +     (vec_duplicate:INT_BROADCAST_MODE
> > +       (match_operand:<ssescalarmode> 1 "general_operand")))]
> > +  "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
> > +{
> > +  if (!ix86_expand_integer_vec_duplicate (operands))
> > +    FAIL;
> > +  DONE;
> > +})
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> > index 1b918144330..a892c94d163 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -5077,8 +5077,6 @@ the mode appropriate for one element of @var{m}.
> >  This pattern only handles duplicates of non-constant inputs.  Constant
> >  vectors go through the @code{mov@var{m}} pattern instead.
> >
> > -This pattern is not allowed to @code{FAIL}.
> > -
> >  @cindex @code{vec_series@var{m}} instruction pattern
> >  @item @samp{vec_series@var{m}}
> >  Initialize vector output operand 0 so that element @var{i} is equal to
Richard Sandiford June 27, 2021, 9 p.m. UTC | #3
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>> > 1. Update vec_duplicate to allow to fail so that backend can only allow
>> > broadcasting an integer constant to a vector when broadcast instruction
>> > is available.  This can be used by memset expander to avoid vec_duplicate
>> > when loading from constant pool is more efficient.
>>
>> I don't see any changes in target-independent code though, other than
>> the doc update.  It's still the case that (existing) uses of
>> vec_duplicate_optab do not allow it to fail.
>
> I have a followup patch set on
>
> https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast
>
> to use it to expand memset with vector broadcast:
>
> https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3
>
> For SSE2 which doesn't have vector broadcast, the constant vector broadcast
> expander returns FAIL and load from constant pool will be used.

Hmm, but as Jeff and I mentioned in the earlier replies,
vec_duplicate_optab shouldn't be used for constants.  Constants
should go via the move expanders instead.

In a previous message I suggested:

  … would it work to change:

	/* Try using vec_duplicate_optab for uniform vectors.  */
	if (!TREE_SIDE_EFFECTS (exp)
	    && VECTOR_MODE_P (mode)
	    && eltmode == GET_MODE_INNER (mode)
	    && ((icode = optab_handler (vec_duplicate_optab, mode))
		!= CODE_FOR_nothing)
	    && (elt = uniform_vector_p (exp)))

  to something like:

	/* Try using vec_duplicate_optab for uniform vectors.  */
	if (!TREE_SIDE_EFFECTS (exp)
	    && VECTOR_MODE_P (mode)
	    && eltmode == GET_MODE_INNER (mode)
	    && (elt = uniform_vector_p (exp)))
	  {
	    if (TREE_CODE (elt) == INTEGER_CST
		|| TREE_CODE (elt) == POLY_INT_CST
		|| TREE_CODE (elt) == REAL_CST
		|| TREE_CODE (elt) == FIXED_CST)
	      {
		rtx src = gen_const_vec_duplicate (mode, expand_normal (node));
		emit_move_insn (target, src);
		break;
	      }
	    …
	  }

if that code was the source of the constant operand.  If we're adding a
new use of vec_duplicate_optab then that should be similarly protected
against constant operands.

Thanks,
Richard
H.J. Lu June 28, 2021, 12:16 p.m. UTC | #4
On Sun, Jun 27, 2021 at 2:00 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> "H.J. Lu" <hjl.tools@gmail.com> writes:
> >> > 1. Update vec_duplicate to allow to fail so that backend can only allow
> >> > broadcasting an integer constant to a vector when broadcast instruction
> >> > is available.  This can be used by memset expander to avoid vec_duplicate
> >> > when loading from constant pool is more efficient.
> >>
> >> I don't see any changes in target-independent code though, other than
> >> the doc update.  It's still the case that (existing) uses of
> >> vec_duplicate_optab do not allow it to fail.
> >
> > I have a followup patch set on
> >
> > https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast
> >
> > to use it to expand memset with vector broadcast:
> >
> > https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3
> >
> > For SSE2 which doesn't have vector broadcast, the constant vector broadcast
> > expander returns FAIL and load from constant pool will be used.
>
> Hmm, but as Jeff and I mentioned in the earlier replies,
> vec_duplicate_optab shouldn't be used for constants.  Constants
> should go via the move expanders instead.
>
> In a previous message I suggested:
>
>   … would it work to change:
>
>         /* Try using vec_duplicate_optab for uniform vectors.  */
>         if (!TREE_SIDE_EFFECTS (exp)
>             && VECTOR_MODE_P (mode)
>             && eltmode == GET_MODE_INNER (mode)
>             && ((icode = optab_handler (vec_duplicate_optab, mode))
>                 != CODE_FOR_nothing)
>             && (elt = uniform_vector_p (exp)))
>
>   to something like:
>
>         /* Try using vec_duplicate_optab for uniform vectors.  */
>         if (!TREE_SIDE_EFFECTS (exp)
>             && VECTOR_MODE_P (mode)
>             && eltmode == GET_MODE_INNER (mode)
>             && (elt = uniform_vector_p (exp)))
>           {
>             if (TREE_CODE (elt) == INTEGER_CST
>                 || TREE_CODE (elt) == POLY_INT_CST
>                 || TREE_CODE (elt) == REAL_CST
>                 || TREE_CODE (elt) == FIXED_CST)
>               {
>                 rtx src = gen_const_vec_duplicate (mode, expand_normal (node));
>                 emit_move_insn (target, src);
>                 break;
>               }
>             …
>           }
>
> if that code was the source of the constant operand.  If we're adding a
> new use of vec_duplicate_optab then that should be similarly protected
> against constant operands.
>

Your comments apply to my initial vec_duplicate patch that caused the
gcc.dg/pr100239.c failure.  It has been fixed by

commit ffe3a37f54ab866d85bdde48c2a32be5e09d8515
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jun 7 20:08:13 2021 +0200

    middle-end/100951 - make sure to generate VECTOR_CST in lowering

    When vector lowering creates piecewise ops make sure to create
    VECTOR_CSTs instead of CONSTRUCTORs when possible.

The problem I am running into now is in my memset vector broadcast
patch.  In order to optimize vector broadcast for memset, I need to
generate a pseudo register for

 __builtin_memset (ops, 3, 38);

only when vector broadcast is available:

  rtx target = nullptr;

  unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
  machine_mode vector_mode;
  if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
    gcc_unreachable ();

  enum insn_code icode = optab_handler (vec_duplicate_optab,
                                        vector_mode);
  if (icode != CODE_FOR_nothing)
    {
      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
      class expand_operand ops[2];
      create_output_operand (&ops[0], reg, vector_mode);
      create_input_operand (&ops[1], data, QImode);
      if (maybe_expand_insn (icode, 2, ops))
        {
          if (!rtx_equal_p (reg, ops[0].value))
            emit_move_insn (reg, ops[0].value);
          target = lowpart_subreg (mode, reg, vector_mode);
        }
    }

  return target;  <<< Return nullptr to load from constant pool.
Richard Sandiford June 28, 2021, 12:36 p.m. UTC | #5
"H.J. Lu" <hjl.tools@gmail.com> writes:
> On Sun, Jun 27, 2021 at 2:00 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> "H.J. Lu" <hjl.tools@gmail.com> writes:
>> >> > 1. Update vec_duplicate to allow to fail so that backend can only allow
>> >> > broadcasting an integer constant to a vector when broadcast instruction
>> >> > is available.  This can be used by memset expander to avoid vec_duplicate
>> >> > when loading from constant pool is more efficient.
>> >>
>> >> I don't see any changes in target-independent code though, other than
>> >> the doc update.  It's still the case that (existing) uses of
>> >> vec_duplicate_optab do not allow it to fail.
>> >
>> > I have a followup patch set on
>> >
>> > https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast
>> >
>> > to use it to expand memset with vector broadcast:
>> >
>> > https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3
>> >
>> > For SSE2 which doesn't have vector broadcast, the constant vector broadcast
>> > expander returns FAIL and load from constant pool will be used.
>>
>> Hmm, but as Jeff and I mentioned in the earlier replies,
>> vec_duplicate_optab shouldn't be used for constants.  Constants
>> should go via the move expanders instead.
>>
>> In a previous message I suggested:
>>
>>   … would it work to change:
>>
>>         /* Try using vec_duplicate_optab for uniform vectors.  */
>>         if (!TREE_SIDE_EFFECTS (exp)
>>             && VECTOR_MODE_P (mode)
>>             && eltmode == GET_MODE_INNER (mode)
>>             && ((icode = optab_handler (vec_duplicate_optab, mode))
>>                 != CODE_FOR_nothing)
>>             && (elt = uniform_vector_p (exp)))
>>
>>   to something like:
>>
>>         /* Try using vec_duplicate_optab for uniform vectors.  */
>>         if (!TREE_SIDE_EFFECTS (exp)
>>             && VECTOR_MODE_P (mode)
>>             && eltmode == GET_MODE_INNER (mode)
>>             && (elt = uniform_vector_p (exp)))
>>           {
>>             if (TREE_CODE (elt) == INTEGER_CST
>>                 || TREE_CODE (elt) == POLY_INT_CST
>>                 || TREE_CODE (elt) == REAL_CST
>>                 || TREE_CODE (elt) == FIXED_CST)
>>               {
>>                 rtx src = gen_const_vec_duplicate (mode, expand_normal (node));
>>                 emit_move_insn (target, src);
>>                 break;
>>               }
>>             …
>>           }
>>
>> if that code was the source of the constant operand.  If we're adding a
>> new use of vec_duplicate_optab then that should be similarly protected
>> against constant operands.
>>
>
> Your comments apply to my initial vec_duplicate patch that caused the
> gcc.dg/pr100239.c failure.  It has been fixed by
>
> commit ffe3a37f54ab866d85bdde48c2a32be5e09d8515
> Author: Richard Biener <rguenther@suse.de>
> Date:   Mon Jun 7 20:08:13 2021 +0200
>
>     middle-end/100951 - make sure to generate VECTOR_CST in lowering
>
>     When vector lowering creates piecewise ops make sure to create
>     VECTOR_CSTs instead of CONSTRUCTORs when possible.
>
> The problem I am running into now is in my memset vector broadcast
> patch.  In order to optimize vector broadcast for memset, I need to
> generate a pseudo register for
>
>  __builtin_memset (ops, 3, 38);
>
> only when vector broadcast is available:
>
>   rtx target = nullptr;
>
>   unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
>   machine_mode vector_mode;
>   if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
>     gcc_unreachable ();
>
>   enum insn_code icode = optab_handler (vec_duplicate_optab,
>                                         vector_mode);
>   if (icode != CODE_FOR_nothing)
>     {
>       rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
>       class expand_operand ops[2];
>       create_output_operand (&ops[0], reg, vector_mode);
>       create_input_operand (&ops[1], data, QImode);
>       if (maybe_expand_insn (icode, 2, ops))
>         {
>           if (!rtx_equal_p (reg, ops[0].value))
>             emit_move_insn (reg, ops[0].value);
>           target = lowpart_subreg (mode, reg, vector_mode);
>         }
>     }
>
>   return target;  <<< Return nullptr to load from constant pool.

I don't think this is a correct use of vec_duplicate_optab.  If the
scalar operand is a constant then the move should always go through
the move expanders instead, as a move from a CONST_VECTOR.

Thanks,
Richard
H.J. Lu June 28, 2021, 7:38 p.m. UTC | #6
On Mon, Jun 28, 2021 at 5:36 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu" <hjl.tools@gmail.com> writes:
> > On Sun, Jun 27, 2021 at 2:00 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
> >> > <richard.sandiford@arm.com> wrote:
> >> >>
> >> >> "H.J. Lu" <hjl.tools@gmail.com> writes:
> >> >> > 1. Update vec_duplicate to allow to fail so that backend can only allow
> >> >> > broadcasting an integer constant to a vector when broadcast instruction
> >> >> > is available.  This can be used by memset expander to avoid vec_duplicate
> >> >> > when loading from constant pool is more efficient.
> >> >>
> >> >> I don't see any changes in target-independent code though, other than
> >> >> the doc update.  It's still the case that (existing) uses of
> >> >> vec_duplicate_optab do not allow it to fail.
> >> >
> >> > I have a followup patch set on
> >> >
> >> > https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast
> >> >
> >> > to use it to expand memset with vector broadcast:
> >> >
> >> > https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3
> >> >
> >> > For SSE2 which doesn't have vector broadcast, the constant vector broadcast
> >> > expander returns FAIL and load from constant pool will be used.
> >>
> >> Hmm, but as Jeff and I mentioned in the earlier replies,
> >> vec_duplicate_optab shouldn't be used for constants.  Constants
> >> should go via the move expanders instead.
> >>
> >> In a previous message I suggested:
> >>
> >>   … would it work to change:
> >>
> >>         /* Try using vec_duplicate_optab for uniform vectors.  */
> >>         if (!TREE_SIDE_EFFECTS (exp)
> >>             && VECTOR_MODE_P (mode)
> >>             && eltmode == GET_MODE_INNER (mode)
> >>             && ((icode = optab_handler (vec_duplicate_optab, mode))
> >>                 != CODE_FOR_nothing)
> >>             && (elt = uniform_vector_p (exp)))
> >>
> >>   to something like:
> >>
> >>         /* Try using vec_duplicate_optab for uniform vectors.  */
> >>         if (!TREE_SIDE_EFFECTS (exp)
> >>             && VECTOR_MODE_P (mode)
> >>             && eltmode == GET_MODE_INNER (mode)
> >>             && (elt = uniform_vector_p (exp)))
> >>           {
> >>             if (TREE_CODE (elt) == INTEGER_CST
> >>                 || TREE_CODE (elt) == POLY_INT_CST
> >>                 || TREE_CODE (elt) == REAL_CST
> >>                 || TREE_CODE (elt) == FIXED_CST)
> >>               {
> >>                 rtx src = gen_const_vec_duplicate (mode, expand_normal (node));
> >>                 emit_move_insn (target, src);
> >>                 break;
> >>               }
> >>             …
> >>           }
> >>
> >> if that code was the source of the constant operand.  If we're adding a
> >> new use of vec_duplicate_optab then that should be similarly protected
> >> against constant operands.
> >>
> >
> > Your comments apply to my initial vec_duplicate patch that caused the
> > gcc.dg/pr100239.c failure.  It has been fixed by
> >
> > commit ffe3a37f54ab866d85bdde48c2a32be5e09d8515
> > Author: Richard Biener <rguenther@suse.de>
> > Date:   Mon Jun 7 20:08:13 2021 +0200
> >
> >     middle-end/100951 - make sure to generate VECTOR_CST in lowering
> >
> >     When vector lowering creates piecewise ops make sure to create
> >     VECTOR_CSTs instead of CONSTRUCTORs when possible.
> >
> > The problem I am running into now is in my memset vector broadcast
> > patch.  In order to optimize vector broadcast for memset, I need to
> > generate a pseudo register for
> >
> >  __builtin_memset (ops, 3, 38);
> >
> > only when vector broadcast is available:
> >
> >   rtx target = nullptr;
> >
> >   unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> >   machine_mode vector_mode;
> >   if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> >     gcc_unreachable ();
> >
> >   enum insn_code icode = optab_handler (vec_duplicate_optab,
> >                                         vector_mode);
> >   if (icode != CODE_FOR_nothing)
> >     {
> >       rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> >       class expand_operand ops[2];
> >       create_output_operand (&ops[0], reg, vector_mode);
> >       create_input_operand (&ops[1], data, QImode);
> >       if (maybe_expand_insn (icode, 2, ops))
> >         {
> >           if (!rtx_equal_p (reg, ops[0].value))
> >             emit_move_insn (reg, ops[0].value);
> >           target = lowpart_subreg (mode, reg, vector_mode);
> >         }
> >     }
> >
> >   return target;  <<< Return nullptr to load from constant pool.
>
> I don't think this is a correct use of vec_duplicate_optab.  If the
> scalar operand is a constant then the move should always go through
> the move expanders instead, as a move from a CONST_VECTOR.

Like this?

  enum insn_code icode = optab_handler (vec_duplicate_optab,
                                        vector_mode);
  if (icode != CODE_FOR_nothing)
    {
      rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
      if (CONST_INT_P (data))
        {
          /* Use the move expander with CONST_VECTOR.  */
          rtvec v = rtvec_alloc (nunits);
          for (unsigned int i = 0; i < nunits; i++)
            RTVEC_ELT (v, i) = data;
          rtx const_vec = gen_rtx_CONST_VECTOR (vector_mode, v);
          emit_move_insn (reg, const_vec);
        }
      else
        {

          class expand_operand ops[2];
          create_output_operand (&ops[0], reg, vector_mode);
          create_input_operand (&ops[1], data, QImode);
          expand_insn (icode, 2, ops);
          if (!rtx_equal_p (reg, ops[0].value))
            emit_move_insn (reg, ops[0].value);
        }
      target = lowpart_subreg (mode, reg, vector_mode);
    }
Richard Sandiford June 29, 2021, 8:17 a.m. UTC | #7
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Mon, Jun 28, 2021 at 5:36 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>> > On Sun, Jun 27, 2021 at 2:00 PM Richard Sandiford
>> > <richard.sandiford@arm.com> wrote:
>> >>
>> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> >> > On Sun, Jun 27, 2021 at 1:43 AM Richard Sandiford
>> >> > <richard.sandiford@arm.com> wrote:
>> >> >>
>> >> >> "H.J. Lu" <hjl.tools@gmail.com> writes:
>> >> >> > 1. Update vec_duplicate to allow to fail so that backend can only allow
>> >> >> > broadcasting an integer constant to a vector when broadcast instruction
>> >> >> > is available.  This can be used by memset expander to avoid vec_duplicate
>> >> >> > when loading from constant pool is more efficient.
>> >> >>
>> >> >> I don't see any changes in target-independent code though, other than
>> >> >> the doc update.  It's still the case that (existing) uses of
>> >> >> vec_duplicate_optab do not allow it to fail.
>> >> >
>> >> > I have a followup patch set on
>> >> >
>> >> > https://gitlab.com/x86-gcc/gcc/-/commits/users/hjl/pieces/broadcast
>> >> >
>> >> > to use it to expand memset with vector broadcast:
>> >> >
>> >> > https://gitlab.com/x86-gcc/gcc/-/commit/991c87f8a83ca736ae9ed92baa3ebadca289f6e3
>> >> >
>> >> > For SSE2 which doesn't have vector broadcast, the constant vector broadcast
>> >> > expander returns FAIL and load from constant pool will be used.
>> >>
>> >> Hmm, but as Jeff and I mentioned in the earlier replies,
>> >> vec_duplicate_optab shouldn't be used for constants.  Constants
>> >> should go via the move expanders instead.
>> >>
>> >> In a previous message I suggested:
>> >>
>> >>   … would it work to change:
>> >>
>> >>         /* Try using vec_duplicate_optab for uniform vectors.  */
>> >>         if (!TREE_SIDE_EFFECTS (exp)
>> >>             && VECTOR_MODE_P (mode)
>> >>             && eltmode == GET_MODE_INNER (mode)
>> >>             && ((icode = optab_handler (vec_duplicate_optab, mode))
>> >>                 != CODE_FOR_nothing)
>> >>             && (elt = uniform_vector_p (exp)))
>> >>
>> >>   to something like:
>> >>
>> >>         /* Try using vec_duplicate_optab for uniform vectors.  */
>> >>         if (!TREE_SIDE_EFFECTS (exp)
>> >>             && VECTOR_MODE_P (mode)
>> >>             && eltmode == GET_MODE_INNER (mode)
>> >>             && (elt = uniform_vector_p (exp)))
>> >>           {
>> >>             if (TREE_CODE (elt) == INTEGER_CST
>> >>                 || TREE_CODE (elt) == POLY_INT_CST
>> >>                 || TREE_CODE (elt) == REAL_CST
>> >>                 || TREE_CODE (elt) == FIXED_CST)
>> >>               {
>> >>                 rtx src = gen_const_vec_duplicate (mode, expand_normal (node));
>> >>                 emit_move_insn (target, src);
>> >>                 break;
>> >>               }
>> >>             …
>> >>           }
>> >>
>> >> if that code was the source of the constant operand.  If we're adding a
>> >> new use of vec_duplicate_optab then that should be similarly protected
>> >> against constant operands.
>> >>
>> >
>> > Your comments apply to my initial vec_duplicate patch that caused the
>> > gcc.dg/pr100239.c failure.  It has been fixed by
>> >
>> > commit ffe3a37f54ab866d85bdde48c2a32be5e09d8515
>> > Author: Richard Biener <rguenther@suse.de>
>> > Date:   Mon Jun 7 20:08:13 2021 +0200
>> >
>> >     middle-end/100951 - make sure to generate VECTOR_CST in lowering
>> >
>> >     When vector lowering creates piecewise ops make sure to create
>> >     VECTOR_CSTs instead of CONSTRUCTORs when possible.
>> >
>> > The problem I am running into now is in my memset vector broadcast
>> > patch.  In order to optimize vector broadcast for memset, I need to
>> > generate a pseudo register for
>> >
>> >  __builtin_memset (ops, 3, 38);
>> >
>> > only when vector broadcast is available:
>> >
>> >   rtx target = nullptr;
>> >
>> >   unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
>> >   machine_mode vector_mode;
>> >   if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
>> >     gcc_unreachable ();
>> >
>> >   enum insn_code icode = optab_handler (vec_duplicate_optab,
>> >                                         vector_mode);
>> >   if (icode != CODE_FOR_nothing)
>> >     {
>> >       rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
>> >       class expand_operand ops[2];
>> >       create_output_operand (&ops[0], reg, vector_mode);
>> >       create_input_operand (&ops[1], data, QImode);
>> >       if (maybe_expand_insn (icode, 2, ops))
>> >         {
>> >           if (!rtx_equal_p (reg, ops[0].value))
>> >             emit_move_insn (reg, ops[0].value);
>> >           target = lowpart_subreg (mode, reg, vector_mode);
>> >         }
>> >     }
>> >
>> >   return target;  <<< Return nullptr to load from constant pool.
>>
>> I don't think this is a correct use of vec_duplicate_optab.  If the
>> scalar operand is a constant then the move should always go through
>> the move expanders instead, as a move from a CONST_VECTOR.
>
> Like this?
>
>   enum insn_code icode = optab_handler (vec_duplicate_optab,
>                                         vector_mode);
>   if (icode != CODE_FOR_nothing)
>     {
>       rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
>       if (CONST_INT_P (data))
>         {
>           /* Use the move expander with CONST_VECTOR.  */
>           rtvec v = rtvec_alloc (nunits);
>           for (unsigned int i = 0; i < nunits; i++)
>             RTVEC_ELT (v, i) = data;
>           rtx const_vec = gen_rtx_CONST_VECTOR (vector_mode, v);

FWIW, this is:

        rtx const_vec = gen_const_vec_duplicate (vector_mode, data);

>           emit_move_insn (reg, const_vec);
>         }
>       else
>         {
>
>           class expand_operand ops[2];
>           create_output_operand (&ops[0], reg, vector_mode);
>           create_input_operand (&ops[1], data, QImode);
>           expand_insn (icode, 2, ops);
>           if (!rtx_equal_p (reg, ops[0].value))
>             emit_move_insn (reg, ops[0].value);
>         }
>       target = lowpart_subreg (mode, reg, vector_mode);
>     }

I guess what I don't understand here is why we want to move a
“vector_mode” constant and take the “mode” subreg of the result.
Won't that get folded to a “mode” move anyway?  It seems more natural
to emit it as a “mode” move from the start.

Also, IMO it looks odd to be guarding the constant case with the
optab check.

Perhaps this is more obvious in the context of the wider patch though.

But yeah, the use of the optab itself looks good.

Thanks,
Richard
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index e9e89c82764..75c160d4349 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -15742,6 +15742,30 @@  ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
     }
 }
 
+/* Expand integer vec_duplicate.  Return true if successful.  */
+
+bool
+ix86_expand_integer_vec_duplicate (rtx *operands)
+{
+  /* Enable VEC_DUPLICATE from a non-standard SSE constant integer only
+     if vector broadcast is available.  */
+  machine_mode mode = GET_MODE (operands[0]);
+  if (CONST_INT_P (operands[1])
+      && (!(TARGET_AVX2
+	    || (TARGET_AVX
+		&& (GET_MODE_INNER (mode) == SImode
+		    || GET_MODE_INNER (mode) == DImode)))
+	  || standard_sse_constant_p (operands[1], mode)))
+    return false;
+
+  bool ok = ix86_expand_vector_init_duplicate (false, mode,
+					       operands[0],
+					       operands[1]);
+  gcc_assert (ok);
+
+  return true;
+}
+
 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
    to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
    The upper bits of DEST are undefined, though they shouldn't cause
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 71745b9a1ea..a6cc09bb75b 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -258,6 +258,7 @@  extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
+extern bool ix86_expand_integer_vec_duplicate (rtx *);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e4f01e64bc1..53a703fb466 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -24640,3 +24640,31 @@  (define_insn "*aes<aeswideklvariant>u8"
   "TARGET_WIDEKL"
   "aes<aeswideklvariant>\t{%0}"
   [(set_attr "type" "other")])
+
+;; Modes handled by broadcast patterns.  NB: Allow V64QI and V32HI with
+;; TARGET_AVX512F since ix86_expand_integer_vec_duplicate can expand
+;; without TARGET_AVX512BW which is used by memset vector broadcast
+;; expander to XI with:
+;; 	vmovd		%edi, %xmm15
+;;	vpbroadcastb	%xmm15, %ymm15
+;;	vinserti64x4	$0x1, %ymm15, %zmm15, %zmm15
+
+(define_mode_iterator INT_BROADCAST_MODE
+  [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
+   (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI
+   (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI
+   (V8DI "TARGET_AVX512F && TARGET_64BIT")
+   (V4DI "TARGET_AVX && TARGET_64BIT") (V2DI "TARGET_64BIT")])
+
+;; Broadcast from an integer.  NB: Enable broadcast only if we can move
+;; from GPR to SSE register directly.
+(define_expand "vec_duplicate<mode>"
+  [(set (match_operand:INT_BROADCAST_MODE 0 "register_operand")
+	(vec_duplicate:INT_BROADCAST_MODE
+	  (match_operand:<ssescalarmode> 1 "general_operand")))]
+  "TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_TO_VEC"
+{
+  if (!ix86_expand_integer_vec_duplicate (operands))
+    FAIL;
+  DONE;
+})
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 1b918144330..a892c94d163 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5077,8 +5077,6 @@  the mode appropriate for one element of @var{m}.
 This pattern only handles duplicates of non-constant inputs.  Constant
 vectors go through the @code{mov@var{m}} pattern instead.
 
-This pattern is not allowed to @code{FAIL}.
-
 @cindex @code{vec_series@var{m}} instruction pattern
 @item @samp{vec_series@var{m}}
 Initialize vector output operand 0 so that element @var{i} is equal to