diff mbox series

[to-be-committed,RISC-V] Use Zbkb for general 64 bit constants when profitable

Message ID 79a5d4d5-9356-4e49-9f2c-20638f66d267@gmail.com
State New
Headers show
Series [to-be-committed,RISC-V] Use Zbkb for general 64 bit constants when profitable | expand

Commit Message

Jeff Law May 31, 2024, 4:10 a.m. UTC
Basically this adds the ability to generate two independent constants 
during  synthesis, then bring them together with a pack instruction. 
Thus we never need to go out to the constant pool when zbkb is enabled. 
The worst sequence we ever generate is

lui+addi+lui+addi+pack

Obviously if either half can be synthesized with just a lui or just an 
addi, then we'll DTRT automagically.   So for example:

unsigned long foo_0xf857f2def857f2de(void) {
     return 0x1425000028000000;
}

The high and low halves are just a lui.  So the final synthesis is:


>         li      a5,671088640            # 15    [c=4 l=4]  *movdi_64bit/1
>         li      a0,337969152            # 16    [c=4 l=4]  *movdi_64bit/1
>         pack    a0,a5,a0        # 17    [c=12 l=4]  riscv_xpack_di_si_2

On the implementation side, I think the bits I've put in here likely can 
be used to handle the repeating constant case for !zbkb.  I think it 
likely could be used to help capture cases where the upper half can be 
derived from the lower half (say by turning a bit on or off, shifting or 
something similar).  The key in both of these cases is we need a 
temporary register holding an intermediate value.

Ventana's internal tester enables zbkb, but I don't think any of the 
other testers currently exercise zbkb.  We'll probably want to change 
that at some point, but I don't think it's super-critical yet.

While I can envision a few more cases where we could improve constant 
synthesis,   No immediate plans to work in this space, but if someone is 
interested, some thoughts are recorded here:


> https://wiki.riseproject.dev/display/HOME/CT_00_031+--+Additional+Constant+Synthesis+Improvements



Jeff
gcc/
	* config/riscv/riscv.cc (riscv_integer_op): Add new field.
	(riscv_build_integer_1): Initialize the new field.
	(riscv_built_integer): Recognize more cases where Zbkb's
	pack instruction is profitable.
	(riscv_move_integer): Loop over all the codes.  If requested,
	save the current constant into a temporary.  Generate pack
	for more cases using the saved constant.

Comments

Andreas Schwab June 7, 2024, 5:49 p.m. UTC | #1
In file included from ../../gcc/rtl.h:3973,
                 from ../../gcc/config/riscv/riscv.cc:31:
In function 'rtx_def* init_rtx_fmt_ee(rtx, machine_mode, rtx, rtx)',
    inlined from 'rtx_def* gen_rtx_fmt_ee_stat(rtx_code, machine_mode, rtx, rtx)' at ./genrtl.h:50:26,
    inlined from 'void riscv_move_integer(rtx, rtx, long int, machine_mode)' at ../../gcc/config/riscv/riscv.cc:2786:10:
./genrtl.h:37:16: error: 'x' may be used uninitialized [-Werror=maybe-uninitialized]
   37 |   XEXP (rt, 0) = arg0;
../../gcc/config/riscv/riscv.cc: In function 'void riscv_move_integer(rtx, rtx, long int, machine_mode)':
../../gcc/config/riscv/riscv.cc:2723:7: note: 'x' was declared here
 2723 |   rtx x;
      |       ^
cc1plus: all warnings being treated as errors
make[3]: *** [Makefile:2563: riscv.o] Error 1
Jeff Law June 9, 2024, 2:36 p.m. UTC | #2
On 6/7/24 11:49 AM, Andreas Schwab wrote:
> In file included from ../../gcc/rtl.h:3973,
>                   from ../../gcc/config/riscv/riscv.cc:31:
> In function 'rtx_def* init_rtx_fmt_ee(rtx, machine_mode, rtx, rtx)',
>      inlined from 'rtx_def* gen_rtx_fmt_ee_stat(rtx_code, machine_mode, rtx, rtx)' at ./genrtl.h:50:26,
>      inlined from 'void riscv_move_integer(rtx, rtx, long int, machine_mode)' at ../../gcc/config/riscv/riscv.cc:2786:10:
> ./genrtl.h:37:16: error: 'x' may be used uninitialized [-Werror=maybe-uninitialized]
>     37 |   XEXP (rt, 0) = arg0;
> ../../gcc/config/riscv/riscv.cc: In function 'void riscv_move_integer(rtx, rtx, long int, machine_mode)':
> ../../gcc/config/riscv/riscv.cc:2723:7: note: 'x' was declared here
>   2723 |   rtx x;
>        |       ^
> cc1plus: all warnings being treated as errors
Thanks.  I guess the change in control flow in there does hide x's state 
pretty well.  It may not even be provable as initialized without knowing 
how this routine interacts with the costing phase that fills in the codes.

I'll take care of it.

Thanks again,
jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 91fefacee80..10af38a5a81 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -250,6 +250,7 @@  struct riscv_arg_info {
    and each VALUE[i] is a constant integer.  CODE[0] is undefined.  */
 struct riscv_integer_op {
   bool use_uw;
+  bool save_temporary;
   enum rtx_code code;
   unsigned HOST_WIDE_INT value;
 };
@@ -759,6 +760,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
       return 1;
     }
   if (TARGET_ZBS && SINGLE_BIT_MASK_OPERAND (value))
@@ -767,6 +769,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
       codes[0].code = UNKNOWN;
       codes[0].value = value;
       codes[0].use_uw = false;
+      codes[0].save_temporary = false;
 
       /* RISC-V sign-extends all 32bit values that live in a 32bit
 	 register.  To avoid paradoxes, we thus need to use the
@@ -796,6 +799,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = PLUS;
 	  alt_codes[alt_cost-1].value = low_part;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -810,6 +814,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = XOR;
 	  alt_codes[alt_cost-1].value = low_part;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -852,6 +857,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost-1].code = ASHIFT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = use_uw;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -873,9 +879,11 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  codes[0].value = (((unsigned HOST_WIDE_INT) value >> trailing_ones)
 			    | (value << (64 - trailing_ones)));
 	  codes[0].use_uw = false;
+	  codes[0].save_temporary = false;
 	  codes[1].code = ROTATERT;
 	  codes[1].value = 64 - trailing_ones;
 	  codes[1].use_uw = false;
+	  codes[1].save_temporary = false;
 	  cost = 2;
 	}
       /* Handle the case where the 11 bit range of zero bits wraps around.  */
@@ -888,9 +896,11 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 			    | ((unsigned HOST_WIDE_INT) value
 			       >> (32 + upper_trailing_ones)));
 	  codes[0].use_uw = false;
+	  codes[0].save_temporary = false;
 	  codes[1].code = ROTATERT;
 	  codes[1].value = 32 - upper_trailing_ones;
 	  codes[1].use_uw = false;
+	  codes[1].save_temporary = false;
 	  cost = 2;
 	}
 
@@ -917,6 +927,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	      alt_codes[alt_cost].code = AND;
 	      alt_codes[alt_cost].value = ~(1UL << bit);
 	      alt_codes[alt_cost].use_uw = false;
+	      alt_codes[alt_cost].save_temporary = false;
 	      alt_cost++;
 	      nval &= ~(1UL << bit);
 	    }
@@ -938,6 +949,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 9;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -948,6 +960,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 5;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -958,6 +971,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	   alt_codes[alt_cost - 1].code = FMA;
 	   alt_codes[alt_cost - 1].value = 3;
 	   alt_codes[alt_cost - 1].use_uw = false;
+	   alt_codes[alt_cost - 1].save_temporary = false;
 	   memcpy (codes, alt_codes, sizeof (alt_codes));
 	   cost = alt_cost;
 	}
@@ -978,6 +992,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[alt_cost - 1].code = PLUS;
 	  alt_codes[alt_cost - 1].value = adjustment;
 	  alt_codes[alt_cost - 1].use_uw = false;
+	  alt_codes[alt_cost - 1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -995,6 +1010,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
 	  alt_codes[i].value = value & 0x7ffff000;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~0x7ffff000;
 	   i++;
 	}
@@ -1005,6 +1021,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : PLUS);
 	  alt_codes[i].value = value & 0x7ff;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~0x7ff;
 	  i++;
 	}
@@ -1016,6 +1033,7 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  alt_codes[i].code = (i == 0 ? UNKNOWN : IOR);
 	  alt_codes[i].value = 1UL << bit;
 	  alt_codes[i].use_uw = false;
+	  alt_codes[i].save_temporary = false;
 	  value &= ~(1ULL << bit);
 	  i++;
 	}
@@ -1057,6 +1075,7 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost-1].code = LSHIFTRT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1069,6 +1088,7 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost-1].code = LSHIFTRT;
 	  alt_codes[alt_cost-1].value = shift;
 	  alt_codes[alt_cost-1].use_uw = false;
+	  alt_codes[alt_cost-1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1093,6 +1113,7 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	  alt_codes[alt_cost - 1].code = XOR;
 	  alt_codes[alt_cost - 1].value = -1;
 	  alt_codes[alt_cost - 1].use_uw = false;
+	  alt_codes[alt_cost - 1].save_temporary = false;
 	  memcpy (codes, alt_codes, sizeof (alt_codes));
 	  cost = alt_cost;
 	}
@@ -1128,13 +1149,55 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
   if (cost > 3 && TARGET_64BIT && TARGET_ZBKB)
     {
       unsigned HOST_WIDE_INT loval = value & 0xffffffff;
-      unsigned HOST_WIDE_INT hival = value & ~loval;
-      if (hival >> 32 == loval)
+      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+      if (hival == loval)
 	{
 	  cost = 1 + riscv_build_integer_1 (codes, sext_hwi (loval, 32), mode);
 	  codes[cost - 1].code = CONCAT;
 	  codes[cost - 1].value = 0;
 	  codes[cost - 1].use_uw = false;
+	  codes[cost - 1].save_temporary = false;
+	}
+
+      /* An arbitrary 64 bit constant can be synthesized in 5 instructions
+	 using zbkb.  We may do better than that if the upper or lower half
+	 can be synthsized with a single LUI, ADDI or BSET.  Regardless the
+	 basic steps are the same.  */
+      if (cost > 3 && can_create_pseudo_p ())
+	{
+	  struct riscv_integer_op hi_codes[RISCV_MAX_INTEGER_OPS];
+	  struct riscv_integer_op lo_codes[RISCV_MAX_INTEGER_OPS];
+	  int hi_cost, lo_cost;
+
+	  /* Synthesize and get cost for each half.  */
+	  lo_cost
+	    = riscv_build_integer_1 (lo_codes, sext_hwi (loval, 32), mode);
+	  hi_cost
+	    = riscv_build_integer_1 (hi_codes, sext_hwi (hival, 32), mode);
+
+	  /* If profitable, finish synthesis using zbkb.  */
+	  if (cost > hi_cost + lo_cost + 1)
+	    {
+	      /* We need the low half independent of the high half.  So
+		 mark it has creating a temporary we'll use later.  */
+	      memcpy (codes, lo_codes,
+		      lo_cost * sizeof (struct riscv_integer_op));
+	      codes[lo_cost - 1].save_temporary = true;
+
+	      /* Now the high half synthesis.  */
+	      memcpy (codes + lo_cost, hi_codes,
+		      hi_cost * sizeof (struct riscv_integer_op));
+
+	      /* Adjust the cost.  */
+	      cost = hi_cost + lo_cost + 1;
+
+	      /* And finally (ab)use VEC_MERGE to indicate we want to
+		 put merge the two parts together.  */
+	      codes[cost - 1].code = VEC_MERGE;
+	      codes[cost - 1].value = 0;
+	      codes[cost - 1].use_uw = false;
+	      codes[cost - 1].save_temporary = false;
+	    }
 	}
 
     }
@@ -2656,23 +2719,25 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
     x = riscv_split_integer (value, mode);
   else
     {
-      codes[0].value = trunc_int_for_mode (codes[0].value, mode);
-      /* Apply each binary operation to X. */
-      x = GEN_INT (codes[0].value);
-
-      for (i = 1; i < num_ops; i++)
+      rtx old_value = NULL_RTX;
+      for (i = 0; i < num_ops; i++)
 	{
-	  if (!can_create_pseudo_p ())
+	  if (i != 0 && !can_create_pseudo_p ())
 	    x = riscv_emit_set (temp, x);
-	  else
+	  else if (i != 0)
 	    x = force_reg (mode, x);
 	  codes[i].value = trunc_int_for_mode (codes[i].value, mode);
-	  /* If the sequence requires using a "uw" form of an insn, we're
-	     going to have to construct the RTL ourselves and put it in
-	     a register to avoid force_reg/force_operand from mucking things
-	     up.  */
-	  if (codes[i].use_uw)
+	  if (codes[i].code == UNKNOWN)
 	    {
+	      /* UNKNOWN means load the constant value into X.  */
+	      x = GEN_INT (codes[i].value);
+	    }
+	  else if (codes[i].use_uw)
+	    {
+	      /* If the sequence requires using a "uw" form of an insn, we're
+		 going to have to construct the RTL ourselves and put it in
+		 a register to avoid force_reg/force_operand from mucking
+		 things up.  */
 	      gcc_assert (TARGET_64BIT || TARGET_ZBA);
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
 
@@ -2695,16 +2760,27 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
 	      x = riscv_emit_set (t, x);
 	    }
-	  else if (codes[i].code == CONCAT)
+	  else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
 	    {
 	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
-	      rtx t2 = gen_lowpart (SImode, x);
+	      rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+	      gcc_assert (t2);
+	      t2 = gen_lowpart (SImode, t2);
 	      emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
 	      x = t;
 	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
 				x, GEN_INT (codes[i].value));
+
+	  /* If this entry in the code table indicates we should save away
+	     the temporary holding the current value of X, then do so.  */
+	  if (codes[i].save_temporary)
+	    {
+	      gcc_assert (old_value == NULL_RTX);
+	      x = force_reg (mode, x);
+	      old_value = x;
+	    }
 	}
     }