diff mbox series

[3/8] tree-ifcvt: Enforce zero else value after maskload.

Message ID D3DDVI6X298Z.3P5JTRKAS57ZV@gmail.com
State New
Headers show
Series [1/8] docs: Document maskload else operand and behavior. | expand

Commit Message

Robin Dapp Aug. 11, 2024, 9 p.m. UTC
When predicating a load we implicitly assume that the else value is
zero.  In order to formalize this, this patch queries the target for
its supported else operand and uses that for the maskload call.
Subsequently, if the else operand is nonzero, a cond_expr enforcing
a zero else value is emitted.

gcc/ChangeLog:

	* tree-if-conv.cc (predicate_load_or_store): Enforce zero else
	value.
	(predicate_statements): Use sequence instead of statement.
---
 gcc/tree-if-conv.cc | 78 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 16 deletions(-)

Comments

Jeff Law Aug. 13, 2024, 7:30 p.m. UTC | #1
On 8/11/24 3:00 PM, Robin Dapp wrote:
> When predicating a load we implicitly assume that the else value is
> zero.  In order to formalize this, this patch queries the target for
> its supported else operand and uses that for the maskload call.
> Subsequently, if the else operand is nonzero, a cond_expr enforcing
> a zero else value is emitted.
> 
> gcc/ChangeLog:
> 
> 	* tree-if-conv.cc (predicate_load_or_store): Enforce zero else
> 	value.
> 	(predicate_statements): Use sequence instead of statement.
> ---
>   gcc/tree-if-conv.cc | 78 +++++++++++++++++++++++++++++++++++----------
>   1 file changed, 62 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 57992b6deca..54cb9ef0ef1 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -2452,10 +2452,12 @@ mask_exists (int size, const vec<int> &vec)
>      write and it needs to be predicated by MASK.  Return a statement
>      that does so.  */
>   
> -static gimple *
> -predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
> +static gimple_seq
> +predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask,
> +			 hash_set<tree_ssa_name_hash> *ssa_names)
Need an update to the function comment for the new SSA_NAMES argument.


> @@ -2789,11 +2829,17 @@ predicate_statements (loop_p loop)
>   		  vect_masks.safe_push (mask);
>   		}
>   	      if (gimple_assign_single_p (stmt))
> -		new_stmt = predicate_load_or_store (&gsi, stmt, mask);
> -	      else
> -		new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
> +		{
> +		  gimple_seq call_seq
> +		    = predicate_load_or_store (&gsi, stmt, mask, &ssa_names);
>   
> -	      gsi_replace (&gsi, new_stmt, true);
> +		  gsi_replace_with_seq (&gsi, call_seq, true);
> +		}
> +	      else
> +		{
> +		  new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
> +		  gsi_replace (&gsi, new_stmt, true);
> +		}
Consider moving NEW_STMT's declaration into the scope where it's only 
set/use occur.


Overall it looks quite reasonable to me.  OK with the nits fixed.

jeff
Richard Biener Aug. 20, 2024, 1:53 p.m. UTC | #2
On Tue, 13 Aug 2024, Jeff Law wrote:

> 
> 
> On 8/11/24 3:00 PM, Robin Dapp wrote:
> > When predicating a load we implicitly assume that the else value is
> > zero.  In order to formalize this, this patch queries the target for
> > its supported else operand and uses that for the maskload call.
> > Subsequently, if the else operand is nonzero, a cond_expr enforcing
> > a zero else value is emitted.

Why?  I don't think the vectorizer relies on a particular else
value?  I'd say it would be appropriate for if-conversion to
use "ANY" and for the vectorizer to then pick a supported
version and/or enforce the else value it needs via a blend?

Richard.

> > gcc/ChangeLog:
> > 
> >  * tree-if-conv.cc (predicate_load_or_store): Enforce zero else
> >  value.
> >  (predicate_statements): Use sequence instead of statement.
> > ---
> >   gcc/tree-if-conv.cc | 78 +++++++++++++++++++++++++++++++++++----------
> >   1 file changed, 62 insertions(+), 16 deletions(-)
> > 
> > diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> > index 57992b6deca..54cb9ef0ef1 100644
> > --- a/gcc/tree-if-conv.cc
> > +++ b/gcc/tree-if-conv.cc
> > @@ -2452,10 +2452,12 @@ mask_exists (int size, const vec<int> &vec)
> >      write and it needs to be predicated by MASK.  Return a statement
> >      that does so.  */
> >   -static gimple *
> > -predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree
> > mask)
> > +static gimple_seq
> > +predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree
> > mask,
> > +			 hash_set<tree_ssa_name_hash> *ssa_names)
> Need an update to the function comment for the new SSA_NAMES argument.
> 
> 
> > @@ -2789,11 +2829,17 @@ predicate_statements (loop_p loop)
> >       vect_masks.safe_push (mask);
> >          	}
> >   	      if (gimple_assign_single_p (stmt))
> > -		new_stmt = predicate_load_or_store (&gsi, stmt, mask);
> > -	      else
> > -		new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
> > +		{
> > +		  gimple_seq call_seq
> > +		    = predicate_load_or_store (&gsi, stmt, mask, &ssa_names);
> >   -	      gsi_replace (&gsi, new_stmt, true);
> > +		  gsi_replace_with_seq (&gsi, call_seq, true);
> > +		}
> > +	      else
> > +		{
> > +		  new_stmt = predicate_rhs_code (stmt, mask, cond,
> > &ssa_names);
> > +		  gsi_replace (&gsi, new_stmt, true);
> > +		}
> Consider moving NEW_STMT's declaration into the scope where it's only set/use
> occur.
> 
> 
> Overall it looks quite reasonable to me.  OK with the nits fixed.
> 
> jeff
> 
>
Robin Dapp Aug. 21, 2024, 6:53 a.m. UTC | #3
> > > When predicating a load we implicitly assume that the else value is
> > > zero.  In order to formalize this, this patch queries the target for
> > > its supported else operand and uses that for the maskload call.
> > > Subsequently, if the else operand is nonzero, a cond_expr enforcing
> > > a zero else value is emitted.
>
> Why?  I don't think the vectorizer relies on a particular else
> value?  I'd say it would be appropriate for if-conversion to
> use "ANY" and for the vectorizer to then pick a supported
> version and/or enforce the else value it needs via a blend?

In PR115336 we have something like

  _Bool iftmp.0_113;
  _Bool iftmp.0_114;
  iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
  iftmp.0_114 = _47 | iftmp.0_113;

which assumes zeroing.  In order to circumvent that we could use COND_IOR
but I suppose that wouldn't be optimized away even on targets that zero
masked elements?  "ANY" would seem to be wrong here.

So instead, right now the flow is to emit a COND_EXPR after the MASK_LOAD
here if the target does not zero and have the vectorizer vectorize it into
a blend (or something else if the surrounding code allows).

What I didn't do (in the posted version, just locally) is an explicit
VEC_COND_EXPR after each masked (gather/load lanes) call the vectorizer
does.  Do we need that?  AFAICT loop masking (be it len style or
fully-masked style) should be safe.
Richard Biener Aug. 21, 2024, 8:10 a.m. UTC | #4
On Wed, 21 Aug 2024, Robin Dapp wrote:

> > > > When predicating a load we implicitly assume that the else value is
> > > > zero.  In order to formalize this, this patch queries the target for
> > > > its supported else operand and uses that for the maskload call.
> > > > Subsequently, if the else operand is nonzero, a cond_expr enforcing
> > > > a zero else value is emitted.
> >
> > Why?  I don't think the vectorizer relies on a particular else
> > value?  I'd say it would be appropriate for if-conversion to
> > use "ANY" and for the vectorizer to then pick a supported
> > version and/or enforce the else value it needs via a blend?
> 
> In PR115336 we have something like
> 
>   _Bool iftmp.0_113;
>   _Bool iftmp.0_114;
>   iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
>   iftmp.0_114 = _47 | iftmp.0_113;
> 
> which assumes zeroing.

I see - is that some trick ifcvt performs?  I can't immediately
see the connection to the PR and it only contains RISC-V assembly
analysis.

> In order to circumvent that we could use COND_IOR
> but I suppose that wouldn't be optimized away even on targets that zero
> masked elements?  "ANY" would seem to be wrong here.

What I was trying to say is that of course any transform we perform
that requires zero-masking should either make .MASK_LOAD perform
that or add a COND_EXPR.  But it shouldn't be required to make
all .MASK_LOADs be zero-masked, no?

I'm of course fine if you think that's the best way for RISC-V given
other targets are likely unaffected as they can perform zero-masking.

> So instead, right now the flow is to emit a COND_EXPR after the MASK_LOAD
> here if the target does not zero and have the vectorizer vectorize it into
> a blend (or something else if the surrounding code allows).
> 
> What I didn't do (in the posted version, just locally) is an explicit
> VEC_COND_EXPR after each masked (gather/load lanes) call the vectorizer
> does.  Do we need that?  AFAICT loop masking (be it len style or
> fully-masked style) should be safe.

Well, why should we need that?  There seems to be the assumption that
.MASK_LOAD is zero-masked in very few places (PR115336, but not
identified there), if we'd assume this everywhere there would be
way more issues with RISC-V?

For example when we do loop masking I think we elide .COND_op
for "safe" operations.  But even that doesn't assume zero-masking.

Richard.
Robin Dapp Aug. 21, 2024, 10:23 a.m. UTC | #5
> > > Why?  I don't think the vectorizer relies on a particular else
> > > value?  I'd say it would be appropriate for if-conversion to
> > > use "ANY" and for the vectorizer to then pick a supported
> > > version and/or enforce the else value it needs via a blend?
> > 
> > In PR115336 we have something like
> > 
> >   _Bool iftmp.0_113;
> >   _Bool iftmp.0_114;
> >   iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
> >   iftmp.0_114 = _47 | iftmp.0_113;
> > 
> > which assumes zeroing.
>
> I see - is that some trick ifcvt performs?  I can't immediately
> see the connection to the PR and it only contains RISC-V assembly
> analysis.

It happens in predicate_scalar_phi where we build the cond_expr.

After converting

  iftmp.0_114 = PHI <iftmp.0_113(45), 1(38)>

we have

  _BoolD.2746 _47;
  iftmp.0_114 = _47 ? 1 : iftmp.0_113;
which is folded into
  iftmp.0_114 = _47 | iftmp.0_113;

I should really have documented that and more in the PR already...
So it's not an ifcvt trick but rather match.

Another related case is PR116059.

> > In order to circumvent that we could use COND_IOR
> > but I suppose that wouldn't be optimized away even on targets that zero
> > masked elements?  "ANY" would seem to be wrong here.
>
> What I was trying to say is that of course any transform we perform
> that requires zero-masking should either make .MAKS_LOAD perform
> that or add a COND_EXPR.  But it shouldn't be required to make
> all .MASK_LOADs be zero-masked, no?
>
> I'm of course fine if you think that's the best way for RISC-V given
> other targets are likely unaffected as they can perform zero-masking.

No, the less zeroing the better of course :)

Richard S's point was to make the COND_EXPR explicit, so that e.g.
a MASK_LOAD (mask, ..., 1) does not appear as cheap as
MASK_LOAD (mask, ..., 0) on zeroing targets.

From this I kind of jumped to the conclusion (see below) that we might want
it everywhere.

With the patches as is, ifcvt would enforce the zero here while all other
masked-load occurrences in the vectorizer would just query the target's
preferred else value and simply use that without blend/cond_expr.

> > What I didn't do (in the posted version, just locally) is an explicit
> > VEC_COND_EXPR after each masked (gather/load lanes) call the vectorizer
> > does.  Do we need that?  AFAICT loop masking (be it len style or
> > fully-masked style) should be safe.
>
> Well, why should we need that?  There seems to be the assumption that
> .MASK_LOAD is zero-masked in very few places (PR115336, but not
> identified there), if we'd assume this everywhere there would be
> way more issues with RISC-V?

Ok, I was already pretty sure we don't need it - and glad to hear it confirmed.
I was just thinking for consistency reasons we might want a masked
load to always look like
  foo123 = .MASK_..._LOAD (mask, ..., else)
  foo124 = COND_EXPR (mask, foo123, 0);
where foo124 would be optimized away (or not even emitted) for zeroing
targets.  That way subsequent code could always rely on zero.
But as this requirement seems very rare it doesn't look like a useful
invariant to enforce.

All in all, it seems we don't need major changes to the approach.
I'm going to work on the comments for the other patches.
Richard Biener Aug. 21, 2024, 10:41 a.m. UTC | #6
On Wed, 21 Aug 2024, Robin Dapp wrote:

> > > > Why?  I don't think the vectorizer relies on a particular else
> > > > value?  I'd say it would be appropriate for if-conversion to
> > > > use "ANY" and for the vectorizer to then pick a supported
> > > > version and/or enforce the else value it needs via a blend?
> > > 
> > > In PR115336 we have something like
> > > 
> > >   _Bool iftmp.0_113;
> > >   _Bool iftmp.0_114;
> > >   iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
> > >   iftmp.0_114 = _47 | iftmp.0_113;
> > > 
> > > which assumes zeroing.
> >
> > I see - is that some trick ifcvt performs?  I can't immediately
> > see the connection to the PR and it only contains RISC-V assembly
> > analysis.
> 
> It happens in predicate_scalar_phi where we build the cond_expr.
> 
> After converting
> 
>   iftmp.0_114 = PHI <iftmp.0_113(45), 1(38)>
> 
> we have
> 
>   _BoolD.2746 _47;
>   iftmp.0_114 = _47 ? 1 : iftmp.0_113;
> which is folded into
>   iftmp.0_114 = _47 | iftmp.0_113;
> 
> I should really have documented that and more in the PR already...
> So it's not an ifcvt trick but rather match.

_47 was the .MASK_LOAD def, right?

It's not exactly obvious what goes wrong - the transform above
is correct - it's only "unexpected" for the lanes that were
masked.  So the actual bug must be downstream from iftmp.0_114.

I think one can try to reason on the ifcvt (scalar) code by
assuming the .MASK_LOAD def would be undefined.  Then we'd
have _47(D) ? 1 : iftmp.0_113 -> _47(D) | iftmp.0_113, I think
that's at most fishy as the COND_EXPR has a well-defined
value while the IOR might spill "undefined" elsewhere causing
divergence.  Is that what is actually happening?

Richard.

> Another related case is PR116059.
> 
> > > In order to circumvent that we could use COND_IOR
> > > but I suppose that wouldn't be optimized away even on targets that zero
> > > masked elements?  "ANY" would seem to be wrong here.
> >
> > What I was trying to say is that of course any transform we perform
> > that requires zero-masking should either make .MASK_LOAD perform
> > that or add a COND_EXPR.  But it shouldn't be required to make
> > all .MASK_LOADs be zero-masked, no?
> >
> > I'm of course fine if you think that's the best way for RISC-V given
> > other targets are likely unaffected as they can perform zero-masking.
> 
> No, the less zeroing the better of course :)
> 
> Richard S's point was to make the COND_EXPR explicit, so that e.g.
> a MASK_LOAD (mask, ..., 1) does not appear as cheap as
> MASK_LOAD (mask, ..., 0) on zeroing targets.
> 
> From this I kind of jumped to the conclusion (see below) that we might want
> it everywhere.
> 
> With the patches as is, ifcvt would enforce the zero here while all other
> masked-load occurrences in the vectorizer would just query the target's
> preferred else value and simply use that without blend/cond_expr.
> 
> > > What I didn't do (in the posted version, just locally) is an explicit
> > > VEC_COND_EXPR after each masked (gather/load lanes) call the vectorizer
> > > does.  Do we need that?  AFAICT loop masking (be it len style or
> > > fully-masked style) should be safe.
> >
> > Well, why should we need that?  There seems to be the assumption that
> > .MASK_LOAD is zero-masked in very few places (PR115336, but not
> > identified there), if we'd assume this everywhere there would be
> > way more issues with RISC-V?
> 
> Ok, I was already pretty sure we don't need it - and glad to hear it confirmed.
> I was just thinking for consistency reasons we might want a masked
> load to always look like
>   foo123 = .MASK_..._LOAD (mask, ..., else)
>   foo124 = COND_EXPR (mask, foo123, 0);
> where foo124 would be optimized away (or not even emitted) for zeroing
> targets.  That way subsequent code could always rely on zero.
> But as this requirement seems very rare it doesn't look like a useful
> invariant to enforce.
> 
> All in all, it seems we don't need major changes to the approach.
> I'm going to work on the comments for the other patches.
> 
>
Robin Dapp Aug. 21, 2024, 11:20 a.m. UTC | #7
> > > >   _Bool iftmp.0_113;
> > > >   _Bool iftmp.0_114;
> > > >   iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
> > > >   iftmp.0_114 = _47 | iftmp.0_113;

> >   _BoolD.2746 _47;
> >   iftmp.0_114 = _47 ? 1 : iftmp.0_113;
> > which is folded into
> >   iftmp.0_114 = _47 | iftmp.0_113;

>
> _47 was the .MASK_LOAD def, right?

_47 is the inverted load mask, iftmp.0_113 is the MASK_LOAD result.
Its mask is _169 where _169 = ~_47;

> It's not exactly obvious what goes wrong - the transform above
> is correct - it's only "unexpected" for the lanes that were
> masked.  So the actual bug must be downstream from iftmp.0_144.
>
> I think one can try to reason on the ifcvt (scalar) code by
> assuming the .MASK_LOAD def would be undefined.  Then we'd
> have _47(D) ? 1 : iftmp.0_113 -> _47(D) | iftmp.0_113, I think
> that's at most fishy as the COND_EXPR has a well-defined
> value while the IOR might spill "undefined" elsewhere causing
> divergence.  Is that what is actually happening?

After vectorization we recognize the mask (_47) as degenerate,
i.e. all ones and, conversely, the masked load mask (_169) is all zeros.
So we shouldn't really load anything.

Optimized we have

  vect_patt_384.36_436 = .MASK_LEN_GATHER_LOAD (_435, vect__47.35_432, 1, { 0, ... }, { 0, ... }, _471, 0);
  vect_iftmp.37_439 = vect_patt_384.36_436 | { 1, ... };

We then re-use a non-zero vector register as masked load result.  Its
stale values cause the wrong result (which should be 1 everywhere).
Richard Biener Aug. 21, 2024, 11:34 a.m. UTC | #8
On Wed, 21 Aug 2024, Robin Dapp wrote:

> > > > >   _Bool iftmp.0_113;
> > > > >   _Bool iftmp.0_114;
> > > > >   iftmp.0_113 = .MASK_LOAD (_170, 8B, _169, _171(D));
> > > > >   iftmp.0_114 = _47 | iftmp.0_113;
> 
> > >   _BoolD.2746 _47;
> > >   iftmp.0_114 = _47 ? 1 : iftmp.0_113;
> > > which is folded into
> > >   iftmp.0_114 = _47 | iftmp.0_113;
> 
> >
> > _47 was the .MASK_LOAD def, right?
> 
> _47 is the inverted load mask, iftmp.0_113 is the MASK_LOAD result.
> Its mask is _169 where _169 = ~_47;
> 
> > It's not exactly obvious what goes wrong - the transform above
> > is correct - it's only "unexpected" for the lanes that were
> > masked.  So the actual bug must be downstream from iftmp.0_114.
> >
> > I think one can try to reason on the ifcvt (scalar) code by
> > assuming the .MASK_LOAD def would be undefined.  Then we'd
> > have _47(D) ? 1 : iftmp.0_113 -> _47(D) | iftmp.0_113, I think
> > that's at most fishy as the COND_EXPR has a well-defined
> > value while the IOR might spill "undefined" elsewhere causing
> > divergence.  Is that what is actually happening?
> 
> After vectorization we recognize the mask (_47) as degenerate,
> i.e. all ones and, conversely, the masked load mask (_169) is all zeros.
> So we shouldn't really load anything.
> 
> Optimized we have
> 
>   vect_patt_384.36_436 = .MASK_LEN_GATHER_LOAD (_435, vect__47.35_432, 1, { 0, ... }, { 0, ... }, _471, 0);
>   vect_iftmp.37_439 = vect_patt_384.36_436 | { 1, ... };
> 
> We then re-use a non-zero vector register as masked load result.  Its
> stale values cause the wrong result (which should be 1 everywhere).

And we fail to fold vect_patt_384.36_436 | { 1, ... } to { 1, ... }?
Or is the issue that vector masks contain padding and with
non-zero masking we'd have garbage in the padding and that leaks
here?  That is, _47 ? 1 : iftmp.0_113 -> _47 | iftmp.0_113 assumes
there's exactly one bit in a bool, specifically it has assumptions
on padding - I'd guess that

 *(char *)p = 17;
 _Bool _47 = *(_Bool *)p;
 ... = _47 ? 1 : b;

would have similar issues but eventually loading 17 into a _Bool
invokes undefined behavior.  So maybe the COND_EXPRs are only
required for .MASK_LOADs of _Bool (or any other type with padding)?

Richard.
Robin Dapp Aug. 21, 2024, 11:54 a.m. UTC | #9
> And we fail to fold vect_patt_384.36_436 | { 1, ... } to { 1, ... }?
> Or is the issue that vector masks contain padding and with
> non-zero masking we'd have garbage in the padding and that leaks
> here?  That is, _47 ? 1 : iftmp.0_113 -> _47 | iftmp.0_113 assumes
> there's exactly one bit in a bool, specifically it has assumptions
> on padding - I'd guess that
>
>  *(char *)p = 17;
>  _Bool _47 = *(_Bool *)p;
>  ... = _47 ? 1 : b;
>
> would have similar issues but eventually loading 17 into a _Bool
> invokes undefined behavior.  So maybe the COND_EXPRs are only
> required for .MASK_LOADs of _Bool (or any other type with padding)?

Hmm yeah, if you put it like that, very likely.  I have only seen
it with _Bool/mask types.  In this PR here the significant bit is
correct and it's the others leaking.
diff mbox series

Patch

diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 57992b6deca..54cb9ef0ef1 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -2452,10 +2452,12 @@  mask_exists (int size, const vec<int> &vec)
    write and it needs to be predicated by MASK.  Return a statement
    that does so.  */
 
-static gimple *
-predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
+static gimple_seq
+predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask,
+			 hash_set<tree_ssa_name_hash> *ssa_names)
 {
-  gcall *new_stmt;
+  gimple_seq stmts = NULL;
+  gcall *call_stmt;
 
   tree lhs = gimple_assign_lhs (stmt);
   tree rhs = gimple_assign_rhs1 (stmt);
@@ -2471,21 +2473,59 @@  predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
 		   ref);
   if (TREE_CODE (lhs) == SSA_NAME)
     {
-      new_stmt
-	= gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
-				      ptr, mask);
-      gimple_call_set_lhs (new_stmt, lhs);
-      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+      /* Get the preferred vector mode and its corresponding mask for the
+	 masked load.  We need this to query the target's supported else
+	 operands.  */
+      machine_mode mode = TYPE_MODE (TREE_TYPE (addr));
+      scalar_mode smode = as_a <scalar_mode> (mode);
+
+      machine_mode vmode = targetm.vectorize.preferred_simd_mode (smode);
+      machine_mode mask_mode
+	= targetm.vectorize.get_mask_mode (vmode).require ();
+
+      int elsval;
+      internal_fn ifn;
+      target_supports_mask_load_store_p (vmode, mask_mode, true, &ifn, &elsval);
+      tree els = vect_get_mask_load_else (elsval, TREE_TYPE (lhs));
+
+      call_stmt
+	= gimple_build_call_internal (IFN_MASK_LOAD, 4, addr,
+				      ptr, mask, els);
+
+      /* Build the load call and, if the else value is nonzero,
+	 a COND_EXPR that enforces it.  */
+      tree loadlhs;
+      if (elsval == MASK_LOAD_ELSE_ZERO)
+	gimple_call_set_lhs (call_stmt, gimple_get_lhs (stmt));
+      else
+	{
+	  loadlhs = make_temp_ssa_name (TREE_TYPE (lhs), NULL, "_ifc_");
+	  ssa_names->add (loadlhs);
+	  gimple_call_set_lhs (call_stmt, loadlhs);
+	}
+      gimple_set_vuse (call_stmt, gimple_vuse (stmt));
+      gimple_seq_add_stmt (&stmts, call_stmt);
+
+      if (elsval != MASK_LOAD_ELSE_ZERO)
+	{
+	  tree cond_rhs
+	    = fold_build_cond_expr (TREE_TYPE (loadlhs), mask, loadlhs,
+				    build_zero_cst (TREE_TYPE (loadlhs)));
+	  gassign *cond_stmt
+	    = gimple_build_assign (gimple_get_lhs (stmt), cond_rhs);
+	  gimple_seq_add_stmt (&stmts, cond_stmt);
+	}
     }
   else
     {
-      new_stmt
+      call_stmt
 	= gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
 				      mask, rhs);
-      gimple_move_vops (new_stmt, stmt);
+      gimple_move_vops (call_stmt, stmt);
+      gimple_seq_add_stmt (&stmts, call_stmt);
     }
-  gimple_call_set_nothrow (new_stmt, true);
-  return new_stmt;
+  gimple_call_set_nothrow (call_stmt, true);
+  return stmts;
 }
 
 /* STMT uses OP_LHS.  Check whether it is equivalent to:
@@ -2789,11 +2829,17 @@  predicate_statements (loop_p loop)
 		  vect_masks.safe_push (mask);
 		}
 	      if (gimple_assign_single_p (stmt))
-		new_stmt = predicate_load_or_store (&gsi, stmt, mask);
-	      else
-		new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
+		{
+		  gimple_seq call_seq
+		    = predicate_load_or_store (&gsi, stmt, mask, &ssa_names);
 
-	      gsi_replace (&gsi, new_stmt, true);
+		  gsi_replace_with_seq (&gsi, call_seq, true);
+		}
+	      else
+		{
+		  new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
+		  gsi_replace (&gsi, new_stmt, true);
+		}
 	    }
 	  else if (((lhs = gimple_assign_lhs (stmt)), true)
 		   && (INTEGRAL_TYPE_P (TREE_TYPE (lhs))