diff mbox series

[V2] New pass for sign/zero extension elimination -- not ready for "final" review

Message ID CAMqJFCo-o0j6Mub4=ko0mEpaVDt6hEOsw9xoGm4PbQ8PY-bdcg@mail.gmail.com
State New
Headers show
Series [V2] New pass for sign/zero extension elimination -- not ready for "final" review | expand

Commit Message

Joern Rennecke Nov. 29, 2023, 7:57 p.m. UTC
Attached is what I have for carry_backpropagate .

The utility of special handling for SS_ASHIFT / US_ASHIFT seems
somewhat marginal.

I suspect it'd be more useful to add handling of LSHIFTRT and ASHIFTRT
.  Some ports do
a lot of static shifting.
commit ed47c3d0d38f85c9b4e22bdbd079e0665465ef9c
Author: Joern Rennecke <joern.rennecke@embecosm.com>
Date:   Wed Nov 29 18:46:06 2023 +0000

    * ext-dce.c: Fixes for carry handling.
    
    * ext-dce.c (safe_for_live_propagation): Handle MINUS.
      (ext_dce_process_uses): Break out carry handling into ..
      (carry_backpropagate): This new function.
      Better handling of ASHIFT.
      Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
      US_ASHIFT.

Comments

Joern Rennecke Nov. 29, 2023, 8:05 p.m. UTC | #1
On Wed, 29 Nov 2023 at 19:57, Joern Rennecke
<joern.rennecke@embecosm.com> wrote:
>
> Attached is what I have for carry_backpropagate .
>
> The utility of special handling for SS_ASHIFT / US_ASHIFT seems
> somewhat marginal.
>
> I suspect it'd be more useful to add handling of LSHIFTRT and ASHIFTRT
> .  Some ports do
> a lot of static shifting.

> +    case SS_ASHIFT:
> +    case US_ASHIFT:
> +      if (!mask || XEXP (x, 1) == const0_rtx)
> +       return 0;

P.S.: I just realized that this is a pasto: in the case of a const0_rtx
shift count,
returning 0 will usually be wrong.  OTOH the code below will handle this
almost perfectly - the one imperfection being that SS_ASHIFT will see
the sign bit set live if anything is live.  Not that it actually
matters if we track
liveness in 8 / 8 / 16 / 32 bit chunks.
Joern Rennecke Nov. 30, 2023, 2:39 a.m. UTC | #2
On Wed, 29 Nov 2023 at 20:05, Joern Rennecke
<joern.rennecke@embecosm.com> wrote:

> > I suspect it'd be more useful to add handling of LSHIFTRT and ASHIFTRT
> > .  Some ports do
> > a lot of static shifting.
>
> > +    case SS_ASHIFT:
> > +    case US_ASHIFT:
> > +      if (!mask || XEXP (x, 1) == const0_rtx)
> > +       return 0;
>
> P.S.: I just realize that this is a pasto: in the case of a const0_rtx
> shift count,
> we returning 0 will usually be wrong.

I've attached my current patch version.
ext-dce.cc: handle vector modes.
    
    * ext-dce.cc: Amend comment to explain how liveness of vectors is tracked.
      (carry_backpropagate): Use GET_MODE_INNER.
      (ext_dce_process_sets): Likewise.  Only apply big endian correction for
      subregs if they don't have a vector mode.
      (ext_dce_process_uses): Likewise.

    * ext-dce.cc: carry_backpropagate: [US]S_ASHIFT fix, handle [LA]SHIFTRT
    
    * ext-dce.cc (safe_for_live_propagation): Add LSHIFTRT and ASHIFTRT.
      (carry_backpropagate): Reformat top comment.
      Add handling of LSHIFTRT and ASHIFTRT.
      Fix bit count for [SU]MUL_HIGHPART.
      Fix pasto for [SU]S_ASHIFT.

    * ext-dce.c: Fixes for carry handling.
    
    * ext-dce.c (safe_for_live_propagation): Handle MINUS.
      (ext_dce_process_uses): Break out carry handling into ..
      (carry_backpropagate): This new function.
      Better handling of ASHIFT.
      Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
      US_ASHIFT.

    * ext-dce.c: fix SUBREG_BYTE test
    
    As mentioned in
    https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637486.html
    and
    https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638473.html


diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 4e4c57de117..228c50e8b73 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -38,7 +38,10 @@ along with GCC; see the file COPYING3.  If not see
    bit 0..7   (least significant byte)
    bit 8..15  (second least significant byte)
    bit 16..31
-   bit 32..BITS_PER_WORD-1  */
+   bit 32..BITS_PER_WORD-1
+
+   For vector modes, we apply these bit groups to every lane; if any of the
+   bits in the group are live in any lane, we consider this group live.  */
 
 /* Note this pass could be used to narrow memory loads too.  It's
    not clear if that's profitable or not in general.  */
@@ -83,6 +86,7 @@ safe_for_live_propagation (rtx_code code)
     case SIGN_EXTEND:
     case TRUNCATE:
     case PLUS:
+    case MINUS:
     case MULT:
     case SMUL_HIGHPART:
     case UMUL_HIGHPART:
@@ -96,6 +100,8 @@ safe_for_live_propagation (rtx_code code)
     case SS_ASHIFT:
     case US_ASHIFT:
     case ASHIFT:
+    case LSHIFTRT:
+    case ASHIFTRT:
       return true;
 
     /* There may be other safe codes.  If so they can be added
@@ -215,13 +221,22 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap livenow, bitmap live_tmp)
 
 	  /* Phase one of destination handling.  First remove any wrapper
 	     such as SUBREG or ZERO_EXTRACT.  */
-	  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (GET_MODE (x));
+	  unsigned HOST_WIDE_INT mask
+	    = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
 	  if (SUBREG_P (x)
 	      && !paradoxical_subreg_p (x)
 	      && SUBREG_BYTE (x).is_constant ())
 	    {
-	      bit = subreg_lsb (x).to_constant ();
-	      mask = GET_MODE_MASK (GET_MODE (SUBREG_REG (x))) << bit;
+	      enum machine_mode omode = GET_MODE_INNER (GET_MODE (x));
+	      enum machine_mode imode = GET_MODE (SUBREG_REG (x));
+	      bit = 0;
+	      if (!VECTOR_MODE_P (GET_MODE (x))
+		  || (GET_MODE_SIZE (imode).is_constant ()
+		      && (GET_MODE_SIZE (omode).to_constant ()
+			  > GET_MODE_SIZE (imode).to_constant ())))
+		bit = subreg_lsb (x).to_constant ();
+	      mask = (GET_MODE_MASK (GET_MODE_INNER (GET_MODE (SUBREG_REG (x))))
+		      << bit);
 	      gcc_assert (mask);
 	      if (!mask)
 		mask = -0x100000000ULL;
@@ -365,6 +380,84 @@ binop_implies_op2_fully_live (rtx_code code)
     }
 }
 
+/* X, with code CODE, is an operation for which safe_for_live_propagation
+   holds true, and bits set in MASK are live in the result.  Compute a
+   mask of (potentially) live bits in the non-constant inputs.  In case of
+   binop_implies_op2_fully_live (e.g. shifts), the computed mask may
+   exclusively pertain to the first operand.  */
+
+HOST_WIDE_INT
+carry_backpropagate (HOST_WIDE_INT mask, enum rtx_code code, rtx x)
+{
+  enum machine_mode mode = GET_MODE_INNER (GET_MODE (x));
+  HOST_WIDE_INT mmask = GET_MODE_MASK (mode);
+  switch (code)
+    {
+    case ASHIFT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	return mask >> INTVAL (XEXP (x, 1));
+      /* Fall through.  */
+    case PLUS: case MINUS:
+    case MULT:
+      return mask ? ((2ULL << floor_log2 (mask)) - 1) : 0;
+    case LSHIFTRT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	return mmask & (mask << INTVAL (XEXP (x, 1)));
+      return mmask;
+    case ASHIFTRT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	{
+	  HOST_WIDE_INT sign = 0;
+	  if (HOST_BITS_PER_WIDE_INT - clz_hwi (mask) + INTVAL (XEXP (x, 1))
+	      > GET_MODE_BITSIZE (mode).to_constant ())
+	    sign = (1ULL << GET_MODE_BITSIZE (mode).to_constant ()) - 1;
+	  return sign | (mmask & (mask << INTVAL (XEXP (x, 1))));
+	}
+      return mmask;
+    case SMUL_HIGHPART: case UMUL_HIGHPART:
+      if (!mask || XEXP (x, 1) == const0_rtx)
+	return 0;
+      if (CONSTANT_P (XEXP (x, 1)))
+	{
+	  if (pow2p_hwi (INTVAL (XEXP (x, 1))))
+	    return mmask & (mask << (GET_MODE_BITSIZE (mode).to_constant ()
+				     - exact_log2 (INTVAL (XEXP (x, 1)))));
+
+	  int bits = (HOST_BITS_PER_WIDE_INT
+		      + GET_MODE_BITSIZE (mode).to_constant ()
+		      - clz_hwi (mask) - ctz_hwi (INTVAL (XEXP (x, 1))));
+	  if (bits < GET_MODE_BITSIZE (mode).to_constant ())
+	    return (1ULL << bits) - 1;
+	}
+      return mmask;
+    case SIGN_EXTEND:
+      if (mask & ~mmask)
+	mask |= 1ULL << (GET_MODE_BITSIZE (mode).to_constant () - 1);
+      return mask;
+
+    /* We propagate for the shifted operand, but not the shift
+       count.  The count is handled specially.  */
+    case SS_ASHIFT:
+    case US_ASHIFT:
+      if (!mask)
+	return 0;
+      if (CONSTANT_P (XEXP (x, 1))
+	  && UINTVAL (XEXP (x, 1)) < GET_MODE_BITSIZE (mode).to_constant ())
+	{
+	  return ((mmask & ~((unsigned HOST_WIDE_INT)mmask
+			     >> (INTVAL (XEXP (x, 1))
+				 + (XEXP (x, 1) != const0_rtx
+				    && code == SS_ASHIFT))))
+		  | (mask >> INTVAL (XEXP (x, 1))));
+	}
+      return mmask;
+    default:
+      return mask;
+    }
+}
 /* Process uses in INSN contained in OBJ.  Set appropriate bits in LIVENOW
    for any chunks of pseudos that become live, potentially filtering using
    bits from LIVE_TMP.
@@ -414,11 +507,19 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 
 	  /* ?!? How much of this should mirror SET handling, potentially
 	     being shared?   */
-	  if (SUBREG_BYTE (dst).is_constant () && SUBREG_P (dst))
+	  if (SUBREG_P (dst) && SUBREG_BYTE (dst).is_constant ())
 	    {
-	      bit = subreg_lsb (dst).to_constant ();
-	      if (bit >= HOST_BITS_PER_WIDE_INT)
-		bit = HOST_BITS_PER_WIDE_INT - 1;
+	      enum machine_mode omode = GET_MODE_INNER (GET_MODE (dst));
+	      enum machine_mode imode = GET_MODE (SUBREG_REG (dst));
+	      if (!VECTOR_MODE_P (GET_MODE (dst))
+		  || (GET_MODE_SIZE (imode).is_constant ()
+		      && (GET_MODE_SIZE (omode).to_constant ()
+			  > GET_MODE_SIZE (imode).to_constant ())))
+		{
+		  bit = subreg_lsb (dst).to_constant ();
+		  if (bit >= HOST_BITS_PER_WIDE_INT)
+		    bit = HOST_BITS_PER_WIDE_INT - 1;
+		}
 	      dst = SUBREG_REG (dst);
 	    }
 	  else if (GET_CODE (dst) == ZERO_EXTRACT
@@ -464,7 +565,7 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 		{
 		  rtx inner = XEXP (src, 0);
 		  unsigned HOST_WIDE_INT src_mask
-		    = GET_MODE_MASK (GET_MODE (inner));
+		    = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (inner)));
 
 		  /* DST_MASK could be zero if we had something in the SET
 		     that we couldn't handle.  */
@@ -480,11 +581,7 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 		 sure everything that should get marked as live is marked
 		 from here onward.  */
 
-	      /* ?!? What is the point of this adjustment to DST_MASK?  */
-	      if (code == PLUS || code == MINUS
-		  || code == MULT || code == ASHIFT)
-		dst_mask
-		  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;
+	      dst_mask = carry_backpropagate (dst_mask, code, src);
 
 	      /* We will handle the other operand of a binary operator
 		 at the bottom of the loop by resetting Y.  */
@@ -516,12 +613,20 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 			 and process normally (conservatively).  */
 		      if (!REG_P (SUBREG_REG (y)))
 			break;
-		      bit = subreg_lsb (y).to_constant ();
-		      if (dst_mask)
+		      enum machine_mode omode = GET_MODE_INNER (GET_MODE (y));
+		      enum machine_mode imode = GET_MODE (SUBREG_REG (y));
+		      if (!VECTOR_MODE_P (GET_MODE (y))
+			  || (GET_MODE_SIZE (imode).is_constant ()
+			      && (GET_MODE_SIZE (omode).to_constant ()
+				  > GET_MODE_SIZE (imode).to_constant ())))
 			{
-			  dst_mask <<= bit;
-			  if (!dst_mask)
-			    dst_mask = -0x100000000ULL;
+			  bit = subreg_lsb (y).to_constant ();
+			  if (dst_mask)
+			    {
+			      dst_mask <<= bit;
+			      if (!dst_mask)
+				dst_mask = -0x100000000ULL;
+			    }
 			}
 		      y = SUBREG_REG (y);
 		    }
@@ -539,7 +644,8 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 			 propagate destination liveness through, then just
 			 set the mask to the mode's mask.  */
 		      if (!safe_for_live_propagation (code))
-			tmp_mask = GET_MODE_MASK (GET_MODE (y));
+			tmp_mask
+			  = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (y)));
 
 		      if (tmp_mask & 0xff)
 			bitmap_set_bit (livenow, rn);
Joern Rennecke Nov. 30, 2023, 4:10 a.m. UTC | #3
I originally computed mmask in carry_backpropagate from XEXP (x, 0),
but abandoned that when I realized we also get called for RTX_OBJ
things.  I forgot to adjust the SIGN_EXTEND code, though.  Fixed
in the attached revised patch.  Also made sure to not make inputs
of LSHIFTRT / ASHIFTRT live if the output is dead (and commented
the checks for (mask == 0) in the process).

Something that could be done to further simplify the code is to make
carry_backpropagate do all the rtx_code-dependent propagation
decisions.  I.e. it would have cases for RTX_OBJ, AND, OR, IOR etc.
that propagate the mask, and the default action would be to make
the input live (after checking not to make any bits in the input
live if the output is dead).

Then we wouldn't need safe_for_live_propagation any more.

Not sure if carry_backpropagate would then still be a suitable name,
though.
* ext-dce.cc (carry_backpropagate): Always return 0 when output is dead.  Fix SIGN_EXTEND input mask.

    * ext-dce.cc: handle vector modes.
    
    * ext-dce.cc: Amend comment to explain how liveness of vectors is tracked.
      (carry_backpropagate): Use GET_MODE_INNER.
      (ext_dce_process_sets): Likewise.  Only apply big endian correction for
      subregs if they don't have a vector mode.
      (ext_dce_process_uses): Likewise.

    * ext-dce.cc: carry_backpropagate: [US]S_ASHIFT fix, handle [LA]SHIFTRT
    
    * ext-dce.cc (safe_for_live_propagation): Add LSHIFTRT and ASHIFTRT.
      (carry_backpropagate): Reformat top comment.
      Add handling of LSHIFTRT and ASHIFTRT.
      Fix bit count for [SU]MUL_HIGHPART.
      Fix pasto for [SU]S_ASHIFT.

    * ext-dce.c: Fixes for carry handling.
    
    * ext-dce.c (safe_for_live_propagation): Handle MINUS.
      (ext_dce_process_uses): Break out carry handling into ..
      (carry_backpropagate): This new function.
      Better handling of ASHIFT.
      Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
      US_ASHIFT.

    * ext-dce.c: fix SUBREG_BYTE test
    
    As mentioned in
    https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637486.html
    and
    https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638473.html

diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 4e4c57de117..fd80052ad75 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -38,7 +38,10 @@ along with GCC; see the file COPYING3.  If not see
    bit 0..7   (least significant byte)
    bit 8..15  (second least significant byte)
    bit 16..31
-   bit 32..BITS_PER_WORD-1  */
+   bit 32..BITS_PER_WORD-1
+
+   For vector modes, we apply these bit groups to every lane; if any of the
+   bits in the group are live in any lane, we consider this group live.  */
 
 /* Note this pass could be used to narrow memory loads too.  It's
    not clear if that's profitable or not in general.  */
@@ -83,6 +86,7 @@ safe_for_live_propagation (rtx_code code)
     case SIGN_EXTEND:
     case TRUNCATE:
     case PLUS:
+    case MINUS:
     case MULT:
     case SMUL_HIGHPART:
     case UMUL_HIGHPART:
@@ -96,6 +100,8 @@ safe_for_live_propagation (rtx_code code)
     case SS_ASHIFT:
     case US_ASHIFT:
     case ASHIFT:
+    case LSHIFTRT:
+    case ASHIFTRT:
       return true;
 
     /* There may be other safe codes.  If so they can be added
@@ -215,13 +221,22 @@ ext_dce_process_sets (rtx_insn *insn, rtx obj, bitmap livenow, bitmap live_tmp)
 
 	  /* Phase one of destination handling.  First remove any wrapper
 	     such as SUBREG or ZERO_EXTRACT.  */
-	  unsigned HOST_WIDE_INT mask = GET_MODE_MASK (GET_MODE (x));
+	  unsigned HOST_WIDE_INT mask
+	    = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
 	  if (SUBREG_P (x)
 	      && !paradoxical_subreg_p (x)
 	      && SUBREG_BYTE (x).is_constant ())
 	    {
-	      bit = subreg_lsb (x).to_constant ();
-	      mask = GET_MODE_MASK (GET_MODE (SUBREG_REG (x))) << bit;
+	      enum machine_mode omode = GET_MODE_INNER (GET_MODE (x));
+	      enum machine_mode imode = GET_MODE (SUBREG_REG (x));
+	      bit = 0;
+	      if (!VECTOR_MODE_P (GET_MODE (x))
+		  || (GET_MODE_SIZE (imode).is_constant ()
+		      && (GET_MODE_SIZE (omode).to_constant ()
+			  > GET_MODE_SIZE (imode).to_constant ())))
+		bit = subreg_lsb (x).to_constant ();
+	      mask = (GET_MODE_MASK (GET_MODE_INNER (GET_MODE (SUBREG_REG (x))))
+		      << bit);
 	      gcc_assert (mask);
 	      if (!mask)
 		mask = -0x100000000ULL;
@@ -365,6 +380,85 @@ binop_implies_op2_fully_live (rtx_code code)
     }
 }
 
+/* X, with code CODE, is an operation for which safe_for_live_propagation
+   holds true, and bits set in MASK are live in the result.  Compute a
+   mask of (potentially) live bits in the non-constant inputs.  In case of
+   binop_implies_op2_fully_live (e.g. shifts), the computed mask may
+   exclusively pertain to the first operand.  */
+
+HOST_WIDE_INT
+carry_backpropagate (HOST_WIDE_INT mask, enum rtx_code code, rtx x)
+{
+  if (mask == 0)
+    return 0;
+
+  enum machine_mode mode = GET_MODE_INNER (GET_MODE (x));
+  HOST_WIDE_INT mmask = GET_MODE_MASK (mode);
+  switch (code)
+    {
+    case ASHIFT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	return mask >> INTVAL (XEXP (x, 1));
+      /* Fall through.  */
+    case PLUS: case MINUS:
+    case MULT:
+      return (2ULL << floor_log2 (mask)) - 1;
+    case LSHIFTRT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	return mmask & (mask << INTVAL (XEXP (x, 1)));
+      return mmask;
+    case ASHIFTRT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	{
+	  HOST_WIDE_INT sign = 0;
+	  if (HOST_BITS_PER_WIDE_INT - clz_hwi (mask) + INTVAL (XEXP (x, 1))
+	      > GET_MODE_BITSIZE (mode).to_constant ())
+	    sign = (1ULL << GET_MODE_BITSIZE (mode).to_constant ()) - 1;
+	  return sign | (mmask & (mask << INTVAL (XEXP (x, 1))));
+	}
+      return mmask;
+    case SMUL_HIGHPART: case UMUL_HIGHPART:
+      if (XEXP (x, 1) == const0_rtx)
+	return 0;
+      if (CONSTANT_P (XEXP (x, 1)))
+	{
+	  if (pow2p_hwi (INTVAL (XEXP (x, 1))))
+	    return mmask & (mask << (GET_MODE_BITSIZE (mode).to_constant ()
+				     - exact_log2 (INTVAL (XEXP (x, 1)))));
+
+	  int bits = (HOST_BITS_PER_WIDE_INT
+		      + GET_MODE_BITSIZE (mode).to_constant ()
+		      - clz_hwi (mask) - ctz_hwi (INTVAL (XEXP (x, 1))));
+	  if (bits < GET_MODE_BITSIZE (mode).to_constant ())
+	    return (1ULL << bits) - 1;
+	}
+      return mmask;
+    case SIGN_EXTEND:
+      if (mask & ~GET_MODE_MASK (GET_MODE_INNER (GET_MODE (XEXP (x, 0)))))
+	mask |= 1ULL << (GET_MODE_BITSIZE (mode).to_constant () - 1);
+      return mask;
+
+    /* We propagate for the shifted operand, but not the shift
+       count.  The count is handled specially.  */
+    case SS_ASHIFT:
+    case US_ASHIFT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && UINTVAL (XEXP (x, 1)) < GET_MODE_BITSIZE (mode).to_constant ())
+	{
+	  return ((mmask & ~((unsigned HOST_WIDE_INT)mmask
+			     >> (INTVAL (XEXP (x, 1))
+				 + (XEXP (x, 1) != const0_rtx
+				    && code == SS_ASHIFT))))
+		  | (mask >> INTVAL (XEXP (x, 1))));
+	}
+      return mmask;
+    default:
+      return mask;
+    }
+}
 /* Process uses in INSN contained in OBJ.  Set appropriate bits in LIVENOW
    for any chunks of pseudos that become live, potentially filtering using
    bits from LIVE_TMP.
@@ -414,11 +508,19 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 
 	  /* ?!? How much of this should mirror SET handling, potentially
 	     being shared?   */
-	  if (SUBREG_BYTE (dst).is_constant () && SUBREG_P (dst))
+	  if (SUBREG_P (dst) && SUBREG_BYTE (dst).is_constant ())
 	    {
-	      bit = subreg_lsb (dst).to_constant ();
-	      if (bit >= HOST_BITS_PER_WIDE_INT)
-		bit = HOST_BITS_PER_WIDE_INT - 1;
+	      enum machine_mode omode = GET_MODE_INNER (GET_MODE (dst));
+	      enum machine_mode imode = GET_MODE (SUBREG_REG (dst));
+	      if (!VECTOR_MODE_P (GET_MODE (dst))
+		  || (GET_MODE_SIZE (imode).is_constant ()
+		      && (GET_MODE_SIZE (omode).to_constant ()
+			  > GET_MODE_SIZE (imode).to_constant ())))
+		{
+		  bit = subreg_lsb (dst).to_constant ();
+		  if (bit >= HOST_BITS_PER_WIDE_INT)
+		    bit = HOST_BITS_PER_WIDE_INT - 1;
+		}
 	      dst = SUBREG_REG (dst);
 	    }
 	  else if (GET_CODE (dst) == ZERO_EXTRACT
@@ -464,7 +566,7 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 		{
 		  rtx inner = XEXP (src, 0);
 		  unsigned HOST_WIDE_INT src_mask
-		    = GET_MODE_MASK (GET_MODE (inner));
+		    = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (inner)));
 
 		  /* DST_MASK could be zero if we had something in the SET
 		     that we couldn't handle.  */
@@ -480,11 +582,7 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 		 sure everything that should get marked as live is marked
 		 from here onward.  */
 
-	      /* ?!? What is the point of this adjustment to DST_MASK?  */
-	      if (code == PLUS || code == MINUS
-		  || code == MULT || code == ASHIFT)
-		dst_mask
-		  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;
+	      dst_mask = carry_backpropagate (dst_mask, code, src);
 
 	      /* We will handle the other operand of a binary operator
 		 at the bottom of the loop by resetting Y.  */
@@ -516,12 +614,20 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 			 and process normally (conservatively).  */
 		      if (!REG_P (SUBREG_REG (y)))
 			break;
-		      bit = subreg_lsb (y).to_constant ();
-		      if (dst_mask)
+		      enum machine_mode omode = GET_MODE_INNER (GET_MODE (y));
+		      enum machine_mode imode = GET_MODE (SUBREG_REG (y));
+		      if (!VECTOR_MODE_P (GET_MODE (y))
+			  || (GET_MODE_SIZE (imode).is_constant ()
+			      && (GET_MODE_SIZE (omode).to_constant ()
+				  > GET_MODE_SIZE (imode).to_constant ())))
 			{
-			  dst_mask <<= bit;
-			  if (!dst_mask)
-			    dst_mask = -0x100000000ULL;
+			  bit = subreg_lsb (y).to_constant ();
+			  if (dst_mask)
+			    {
+			      dst_mask <<= bit;
+			      if (!dst_mask)
+				dst_mask = -0x100000000ULL;
+			    }
 			}
 		      y = SUBREG_REG (y);
 		    }
@@ -539,7 +645,8 @@ ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 			 propagate destination liveness through, then just
 			 set the mask to the mode's mask.  */
 		      if (!safe_for_live_propagation (code))
-			tmp_mask = GET_MODE_MASK (GET_MODE (y));
+			tmp_mask
+			  = GET_MODE_MASK (GET_MODE_INNER (GET_MODE (y)));
 
 		      if (tmp_mask & 0xff)
 			bitmap_set_bit (livenow, rn);
Jeff Law Nov. 30, 2023, 3:46 p.m. UTC | #4
On 11/29/23 19:39, Joern Rennecke wrote:
> On Wed, 29 Nov 2023 at 20:05, Joern Rennecke
> <joern.rennecke@embecosm.com> wrote:
> 
>>> I suspect it'd be more useful to add handling of LSHIFTRT and ASHIFTRT
>>> .  Some ports do
>>> a lot of static shifting.
>>
>>> +    case SS_ASHIFT:
>>> +    case US_ASHIFT:
>>> +      if (!mask || XEXP (x, 1) == const0_rtx)
>>> +       return 0;
>>
>> P.S.: I just realize that this is a pasto: in the case of a const0_rtx
>> shift count,
>> we returning 0 will usually be wrong.
> 
> I've attached my current patch version.
I would strongly suggest we not start adding a lot of new capabilities 
here, deferring such work until gcc-15.  Our focus should be getting the 
existing code working well and cleanly implemented.

Jeff
Jeff Law Nov. 30, 2023, 5:53 p.m. UTC | #5
On 11/29/23 19:39, Joern Rennecke wrote:
> On Wed, 29 Nov 2023 at 20:05, Joern Rennecke
> <joern.rennecke@embecosm.com>  wrote:
> 
>>> I suspect it'd be more useful to add handling of LSHIFTRT and ASHIFTRT
>>> .  Some ports do
>>> a lot of static shifting.
>>> +    case SS_ASHIFT:
>>> +    case US_ASHIFT:
>>> +      if (!mask || XEXP (x, 1) == const0_rtx)
>>> +       return 0;
>> P.S.: I just realize that this is a pasto: in the case of a const0_rtx
>> shift count,
>> we returning 0 will usually be wrong.
> I've attached my current patch version.
> 
> 
> tmp.txt
> 
>      ext-dce.cc: handle vector modes.
>      
>      * ext-dce.cc: Amend comment to explain how liveness of vectors is tracked.
>        (carry_backpropagate): Use GET_MODE_INNER.
>        (ext_dce_process_sets): Likewise.  Only apply big endian correction for
>        subregs if they don't have a vector mode.
>        (ext_cde_process_uses): Likewise.
> 
>      * ext-dce.cc: carry_backpropagate: [US]S_ASHIFT fix, handle [LA]SHIFTRT
>      
>      * ext-dce.cc (safe_for_live_propagation): Add LSHIFTRT and ASHIFTRT.
>        (carry_backpropagate): Reformat top comment.
>        Add handling of LSHIFTRT and ASHIFTRT.
>        Fix bit count for [SU]MUL_HIGHPART.
>        Fix pasto for [SU]S_ASHIFT.
> 
>      * ext-dce.c: Fixes for carry handling.
>      
>      * ext-dce.c (safe_for_live_propagation): Handle MINUS.
>        (ext_dce_process_uses): Break out carry handling into ..
>        (carry_backpropagate): This new function.
>        Better handling of ASHIFT.
>        Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
>        US_ASHIFT.
> 
>      * ext-dce.c: fix SUBREG_BYTE test
>      
>      As mentioned in
>      https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637486.html
>      and
>      https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638473.html
> 
> 
> diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
> index 4e4c57de117..228c50e8b73 100644
> --- a/gcc/ext-dce.cc
> +++ b/gcc/ext-dce.cc
> @@ -38,7 +38,10 @@ along with GCC; see the file COPYING3.  If not see
>      bit 0..7   (least significant byte)
>      bit 8..15  (second least significant byte)
>      bit 16..31
> -   bit 32..BITS_PER_WORD-1  */
> +   bit 32..BITS_PER_WORD-1
> +
> +   For vector modes, we apply these bit groups to every lane; if any of the
> +   bits in the group are live in any lane, we consider this group live.  */
Why add vector modes now?  I realize it might help a vectorized sub*_dct 
from x264, but I was thinking that would be more of a gcc-15 improvement.

Was this in fact to help sub*_dct or something else concrete?  Unless 
it's concrete and significant, I'd lean towards deferring the 
enhancement until gcc-15.

>   
>   /* Note this pass could be used to narrow memory loads too.  It's
>      not clear if that's profitable or not in general.  */

> @@ -96,6 +100,8 @@ safe_for_live_propagation (rtx_code code)
>       case SS_ASHIFT:
>       case US_ASHIFT:
>       case ASHIFT:
> +    case LSHIFTRT:
> +    case ASHIFTRT:
>         return true;
So this starts to touch on a cleanup Richard mentioned.  The codes in 
there until now were supposed to be safe across the board.  As we add 
things like LSHIFTRT, we need to describe how to handle liveness 
transfer from the destination into the source(s).  I think what Richard 
is asking for is to just have one place which handles both.

Anyway, my current plan would be to pull in the formatting fixes, the 
back propagation without the vector enhancement.  We've already fixed 
the subreg thingie locally.


jeff
Joern Rennecke Nov. 30, 2023, 6:31 p.m. UTC | #6
On Thu, 30 Nov 2023 at 17:53, Jeff Law <jeffreyalaw@gmail.com> wrote:
 > >      * ext-dce.c: Fixes for carry handling.
> >
> >      * ext-dce.c (safe_for_live_propagation): Handle MINUS.
> >        (ext_dce_process_uses): Break out carry handling into ..
> >        (carry_backpropagate): This new function.
> >        Better handling of ASHIFT.
> >        Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
> >        US_ASHIFT.
> >
> >      * ext-dce.c: fix SUBREG_BYTE test
> >
> >      As mentioned in
> >      https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637486.html
> >      and
> >      https://gcc.gnu.org/pipermail/gcc-patches/2023-November/638473.html
> >
> >
> > diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
> > index 4e4c57de117..228c50e8b73 100644
> > --- a/gcc/ext-dce.cc
> > +++ b/gcc/ext-dce.cc
> > @@ -38,7 +38,10 @@ along with GCC; see the file COPYING3.  If not see
> >      bit 0..7   (least significant byte)
> >      bit 8..15  (second least significant byte)
> >      bit 16..31
> > -   bit 32..BITS_PER_WORD-1  */
> > +   bit 32..BITS_PER_WORD-1
> > +
> > +   For vector modes, we apply these bit groups to every lane; if any of the
> > +   bits in the group are live in any lane, we consider this group live.  */
> Why add vector modes now?  I realize it might help a vectorized sub*_dct
> from x264, but I was thinking that would be more of a gcc-15 improvement.

Actually, we already did, but because it was unintentional, it wasn't
done properly.

I've been using GET_MODE_BITSIZE (GET_MODE (x)).to_constant, thinking a mode
should just have a constant size that can easily fit into an int.  I was wrong.
Debugging found that was a scalable vector mode.  SUBREGs, shifts and
other stuff
has vector modes and goes through the code.  Instead of adding code to bail our,
I though it would be a good idea to think about how vector modes can
be supported
without balooning the computation time or memory.  And keeping in mind
the original
intention of the patch - eliminating redundant sign/zero extension -
that actually can
applied to vectors as well, and that means we should consider how
these operations
work on each lane.

By looking at the inner mode of a vector, we also conveniently
get a sane size.
For complex numbers, it's also saner to treat them as two-element vectors, than
trying to apply the bit parts while ignoring the structure, so it
makes sense to use
GET_MODE_INNER in general.

Something that could be done for further improvement but seems too complex
for gcc 14 would be to handle vector constants as shift counts.

Come to think of it, I actually applied the wrong test for the integer
shift counts -
it should be CONST_INT_P, not CONSTANT_P.

> >
> >   /* Note this pass could be used to narrow memory loads too.  It's
> >      not clear if that's profitable or not in general.  */
>
> > @@ -96,6 +100,8 @@ safe_for_live_propagation (rtx_code code)
> >       case SS_ASHIFT:
> >       case US_ASHIFT:
> >       case ASHIFT:
> > +    case LSHIFTRT:
> > +    case ASHIFTRT:
> >         return true;
> So this starts to touch on a cleanup Richard mentioned.  The codes in
> there until now were supposed to be safe across the board.

Saturating operations are not safe at all without explicitly computing
the liveness propagation.

>  As we add
> things like LSHIFTRT, we need to describe how to handle liveness
> transfer from the destination into the source(s).  I think what Richard
> is asking for is to just have one place which handles both.

LSHIFTRT is much simpler than the saturating operations.

> Anyway, my current plan would be to pull in the formatting fixes, the
> back propagation without the vector enhancement.

Pretending the vector modes don't happen is not making the code safe.
We have to handle them somehow, so we might as well do that in a way
that is consistent and gives more potential for optimization.
Jeff Law Nov. 30, 2023, 7:15 p.m. UTC | #7
On 11/30/23 11:31, Joern Rennecke wrote:

> 
> Pretending the vector modes don't happen is not making the code safe.
> We have to handle them somehow, so we might as well do that in a way
> that is consistent and gives more potential for optimization.
We're not pretending they don't happen.  Quite the opposite.  When we 
see them we need to take appropriate action.

For a set you can ignore since that means we'll keep objects live longer 
than they normally would have been -- which is the safe thing for this pass.

For a use, you can't ignore, ever.  You must always make live anything 
that is potentially used or you run the risk of incorrect code generation.

If that's not what we're doing now, then let's fix that without 
introducing a whole new set of optimizations that need to be analyzed 
and debugged.

I would have been all for this work a month ago, but we're past stage1 
close so the focus needs to be on cleaning up what we've got for gcc-14.

Jeff
Jeff Law Nov. 30, 2023, 9:32 p.m. UTC | #8
On 11/30/23 14:22, Jeff Law wrote:

> So if you're going to add opcodes in here, don't they also need to be in 
> safe_for_propagation_p as well?  Otherwise they won't get used, right? 
> I'm thinking about the HIGHPART cases for example.  As well as the 
> saturating shifts which we indicated we were going to take out of 
> safe_for_propagation_p IIRC.
Ignore my comment about HIGHPART :-)

Jeff
Jeff Law Dec. 12, 2023, 5:18 p.m. UTC | #9
On 11/29/23 21:10, Joern Rennecke wrote:
>   I originally computed mmask in carry_backpropagate from XEXP (x, 0),
> but abandoned that when I realized we also get called for RTX_OBJ
> things.  I forgot to adjust the SIGN_EXTEND code, though.  Fixed
> in the attached revised patch.  Also made sure to not make inputs
> of LSHIFTRT / ASHIFTRT live if the output is dead (and commened
> the checks for (mask == 0) in the process).
> 
> Something that could be done to futher simplif the code is to make
> carry_backpropagate do all the rtx_code-dependent propagation
> decisions.  I.e. would have cases for RTX_OBJ, AND, OR, IOR etc
> that propagate the mask, and the default action would be to make
> the input live (after the check not make any bits in the input
> live if the output is dead).
> 
> Then we wouldn't need safe_for_live_propagation any more.
> 
> Not sure if carry_backpropagate would then still be a suitable name
> anymore, though.
> 
> 
> tmp.txt
> 
>      * ext-dce.cc (carry_backpropagate): Always return 0 when output is dead.  Fix SIGN_EXTEND input mask.
> 
>      * ext-dce.cc: handle vector modes.
>      
>      * ext-dce.cc: Amend comment to explain how liveness of vectors is tracked.
>        (carry_backpropagate): Use GET_MODE_INNER.
>        (ext_dce_process_sets): Likewise.  Only apply big endian correction for
>        subregs if they don't have a vector mode.
>        (ext_cde_process_uses): Likewise.
> 
>      * ext-dce.cc: carry_backpropagate: [US]S_ASHIFT fix, handle [LA]SHIFTRT
>      
>      * ext-dce.cc (safe_for_live_propagation): Add LSHIFTRT and ASHIFTRT.
>        (carry_backpropagate): Reformat top comment.
>        Add handling of LSHIFTRT and ASHIFTRT.
>        Fix bit count for [SU]MUL_HIGHPART.
>        Fix pasto for [SU]S_ASHIFT.
> 
>      * ext-dce.c: Fixes for carry handling.
>      
>      * ext-dce.c (safe_for_live_propagation): Handle MINUS.
>        (ext_dce_process_uses): Break out carry handling into ..
>        (carry_backpropagate): This new function.
>        Better handling of ASHIFT.
>        Add handling of SMUL_HIGHPART, UMUL_HIGHPART, SIGN_EXTEND, SS_ASHIFT and
>        US_ASHIFT.
> 
>      * ext-dce.c: fix SUBREG_BYTE test
I haven't done an update in a little while.  My tester spun this without 
the vector bits which I'm still pondering.  It did flag one issue.

Specifically on the alpha pr53645.c failed due to the ASHIFTRT handling.


> +    case ASHIFTRT:
> +      if (CONSTANT_P (XEXP (x, 1))
> +	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
> +	{
> +	  HOST_WIDE_INT sign = 0;
> +	  if (HOST_BITS_PER_WIDE_INT - clz_hwi (mask) + INTVAL (XEXP (x, 1))
> +	      > GET_MODE_BITSIZE (mode).to_constant ())
> +	    sign = (1ULL << GET_MODE_BITSIZE (mode).to_constant ()) - 1;
> +	  return sign | (mmask & (mask << INTVAL (XEXP (x, 1))));
> +	}
The "-1" when computing the sign bit is meant to apply to the shift count.

Jeff
diff mbox series

Patch

diff --git a/gcc/ext-dce.cc b/gcc/ext-dce.cc
index 590656f72c7..2a4508181a1 100644
--- a/gcc/ext-dce.cc
+++ b/gcc/ext-dce.cc
@@ -83,6 +83,7 @@  safe_for_live_propagation (rtx_code code)
     case SIGN_EXTEND:
     case TRUNCATE:
     case PLUS:
+    case MINUS:
     case MULT:
     case SMUL_HIGHPART:
     case UMUL_HIGHPART:
@@ -365,6 +366,67 @@  binop_implies_op2_fully_live (rtx_code code)
     }
 }
 
+/* X, with code CODE, is an operation for which
+   safe_for_live_propagation holds true, and bits set in MASK are
+   live in the result.  Compute a mask of (potentially) live bits
+   in the non-constant inputs.  In case of
+   binop_implies_op2_fully_live (e.g. shifts), the computed mask
+   may exclusively pertain to the first operand; the shift count
+   is handled specially.  */
+
+HOST_WIDE_INT
+carry_backpropagate (HOST_WIDE_INT mask, enum rtx_code code, rtx x)
+{
+  enum machine_mode mode = GET_MODE (x);
+  HOST_WIDE_INT mmask = GET_MODE_MASK (mode);
+  switch (code)
+    {
+    case ASHIFT:
+      if (CONSTANT_P (XEXP (x, 1))
+	  && known_lt (UINTVAL (XEXP (x, 1)), GET_MODE_BITSIZE (mode)))
+	return mask >> INTVAL (XEXP (x, 1));
+      /* Fall through.  */
+    case PLUS: case MINUS:
+    case MULT:
+      return mask ? ((2ULL << floor_log2 (mask)) - 1) : 0;
+    case SMUL_HIGHPART: case UMUL_HIGHPART:
+      if (!mask || XEXP (x, 1) == const0_rtx)
+	return 0;
+      if (CONSTANT_P (XEXP (x, 1)))
+	{
+	  if (pow2p_hwi (INTVAL (XEXP (x, 1))))
+	    return mmask & (mask << (GET_MODE_BITSIZE (mode).to_constant ()
+				     - exact_log2 (INTVAL (XEXP (x, 1)))));
+
+	  int bits = (2 * GET_MODE_BITSIZE (mode).to_constant ()
+		      - clz_hwi (mask) - ctz_hwi (INTVAL (XEXP (x, 1))));
+	  if (bits < GET_MODE_BITSIZE (mode).to_constant ())
+	    return (1ULL << bits) - 1;
+	}
+      return mmask;  /* Conservatively assume all input bits are live.  */
+    case SIGN_EXTEND:
+      if (mask & ~mmask)
+	mask |= 1ULL << (GET_MODE_BITSIZE (mode).to_constant () - 1);
+      return mask;
+
+    /* We propagate for the shifted operand, but not the shift
+       count.  The count is handled specially.  */
+    case SS_ASHIFT:
+    case US_ASHIFT:
+      if (!mask || XEXP (x, 1) == const0_rtx)
+	return 0;  /* ??? Returning 0 for a const0_rtx count is a known pasto.  */
+      if (CONSTANT_P (XEXP (x, 1))
+	  && UINTVAL (XEXP (x, 1)) < GET_MODE_BITSIZE (mode).to_constant ())
+	{
+	  return ((mmask & ~((unsigned HOST_WIDE_INT)mmask
+			     >> (INTVAL (XEXP (x, 1)) + (code == SS_ASHIFT))))
+		  | (mask >> INTVAL (XEXP (x, 1))));
+	}
+      return mmask;  /* Conservatively assume all input bits are live.  */
+    default:
+      return mask;
+    }
+}
 /* Process uses in INSN contained in OBJ.  Set appropriate bits in LIVENOW
    for any chunks of pseudos that become live, potentially filtering using
    bits from LIVE_TMP.
@@ -480,11 +542,7 @@  ext_dce_process_uses (rtx_insn *insn, rtx obj, bitmap livenow,
 		 sure everything that should get marked as live is marked
 		 from here onward.  */
 
-	      /* ?!? What is the point of this adjustment to DST_MASK?  */
-	      if (code == PLUS || code == MINUS
-		  || code == MULT || code == ASHIFT)
-		dst_mask
-		  = dst_mask ? ((2ULL << floor_log2 (dst_mask)) - 1) : 0;
+	      dst_mask = carry_backpropagate (dst_mask, code, src);
 
 	      /* We will handle the other operand of a binary operator
 		 at the bottom of the loop by resetting Y.  */