
[1/2,v2] RISC-V: Additional large constant synthesis improvements

Message ID 20240905195042.237016-1-rzinsly@ventanamicro.com
State New
Series [1/2,v2] RISC-V: Additional large constant synthesis improvements

Commit Message

Raphael Moreira Zinsly Sept. 5, 2024, 7:50 p.m. UTC
Changes since v1:
	- Fix bit31.
	- Remove negative shift checks.
	- Fix synthesis-7.c expected output.

-- >8 --

Improve handling of large constants in riscv_build_integer: generate
better code for constants where the high half can be constructed by
shifting or shNadd-ing the low half, or where the two halves differ
by less than 2048.
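
For illustration, a minimal standalone sketch of the new shift
detection (a hypothetical helper, not the patch code itself;
__builtin_ctzll/__builtin_clzll stand in for GCC's ctz_hwi/clz_hwi,
the constants come from the new tests, and the patch additionally
requires bit 31 of the low half to be clear):

  #include <stdint.h>
  #include <stdio.h>

  /* Return the left shift that turns the low half of VALUE into its
     high half, i.e. (loval << shift) == (hival << 32), or 0 if the
     halves are not related by a plain shift.  */
  static int
  high_half_shift (uint64_t value)
  {
    uint64_t loval = value & 0xffffffff;
    uint64_t hival = value >> 32;

    if (loval == 0 || hival == 0)
      return 0;

    int trailing = __builtin_ctzll (loval) - __builtin_ctzll (hival);
    int leading = __builtin_clzll (loval) - __builtin_clzll (hival);

    if (trailing > 0 && hival == (loval >> trailing))
      return 32 - trailing;  /* high half is the low half >> trailing.  */
    if (leading > 0 && hival == (loval << leading))
      return 32 + leading;   /* high half is the low half << leading.  */
    return 0;
  }

  int
  main (void)
  {
    printf ("%d\n", high_half_shift (0x1ffff7fe3fffeffcULL)); /* 31 */
    printf ("%d\n", high_half_shift (0x5983800001660e00ULL)); /* 38 */
    return 0;
  }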

gcc/ChangeLog:
	* config/riscv/riscv.cc (riscv_build_integer): Detect new case
	of constants that can be improved.
	(riscv_move_integer): Add synthesis for concatenating constants
	without Zbkb.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/synthesis-7.c: Adjust expected output.
	* gcc.target/riscv/synthesis-12.c: New test.
	* gcc.target/riscv/synthesis-13.c: New test.
	* gcc.target/riscv/synthesis-14.c: New test.
---
 gcc/config/riscv/riscv.cc                     | 138 +++++++++++++++++-
 gcc/testsuite/gcc.target/riscv/synthesis-12.c |  26 ++++
 gcc/testsuite/gcc.target/riscv/synthesis-13.c |  26 ++++
 gcc/testsuite/gcc.target/riscv/synthesis-14.c |  28 ++++
 gcc/testsuite/gcc.target/riscv/synthesis-7.c  |   2 +-
 5 files changed, 213 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-12.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-13.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-14.c

Comments

Jeff Law Sept. 6, 2024, 2:48 a.m. UTC | #1
On 9/5/24 1:50 PM, Raphael Moreira Zinsly wrote:
> Changes since v1:
> 	- Fix bit31.
> 	- Remove negative shift checks.
> 	- Fix synthesis-7.c expected output.
> 
> -- >8 --
> 
> Improve handling of large constants in riscv_build_integer: generate
> better code for constants where the high half can be constructed by
> shifting or shNadd-ing the low half, or where the two halves differ
> by less than 2048.
> 
> gcc/ChangeLog:
> 	* config/riscv/riscv.cc (riscv_build_integer): Detect new case
> 	of constants that can be improved.
> 	(riscv_move_integer): Add synthesis for concatenating constants
> 	without Zbkb.
> 
> gcc/testsuite/ChangeLog:
> 	* gcc.target/riscv/synthesis-7.c: Adjust expected output.
> 	* gcc.target/riscv/synthesis-12.c: New test.
> 	* gcc.target/riscv/synthesis-13.c: New test.
> 	* gcc.target/riscv/synthesis-14.c: New test.
> ---
>   gcc/config/riscv/riscv.cc                     | 138 +++++++++++++++++-
>   gcc/testsuite/gcc.target/riscv/synthesis-12.c |  26 ++++
>   gcc/testsuite/gcc.target/riscv/synthesis-13.c |  26 ++++
>   gcc/testsuite/gcc.target/riscv/synthesis-14.c |  28 ++++
>   gcc/testsuite/gcc.target/riscv/synthesis-7.c  |   2 +-
>   5 files changed, 213 insertions(+), 7 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-12.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-13.c
>   create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-14.c
> 
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index a38cb72f09f..df8a5a1c1e2 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -1231,6 +1231,122 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
>   	}
>   
>       }
> +  else if (cost > 4 && TARGET_64BIT && can_create_pseudo_p ()
> +	   && allow_new_pseudos)
> +    {
> +      struct riscv_integer_op alt_codes[RISCV_MAX_INTEGER_OPS];
> +      int alt_cost;
> +
> +      unsigned HOST_WIDE_INT loval = value & 0xffffffff;
> +      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
> +      bool bit31 = (loval & 0x80000000) != 0;
> +      int trailing_shift = ctz_hwi (loval) - ctz_hwi (hival);
> +      int leading_shift = clz_hwi (loval) - clz_hwi (hival);
> +      int shiftval = 0;
> +
> +      /* Adjust the shift into the high half accordingly.  */
> +      if ((trailing_shift > 0 && hival == (loval >> trailing_shift)))
> +	shiftval = 32 - trailing_shift;
> +      else if ((leading_shift > 0 && hival == (loval << leading_shift)))
> +	shiftval = 32 + leading_shift;
> +
> +      if (shiftval && !bit31)
> +	alt_cost = 2 + riscv_build_integer_1 (alt_codes, sext_hwi (loval, 32),
> +					      mode);
> +
> +      /* For constants where the upper half is a shift of the lower half we
> +	 can do a shift followed by an or.  */
> +      if (shiftval && alt_cost < cost && !bit31)
So if shiftval is nonzero and bit31 is also set, then doesn't this 
test alt_cost < cost without having initialized alt_cost?  I think this 
can be fixed by testing !bit31 before the alt_cost < cost check.
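
To see the evaluation order concretely, a reduced sketch (names mirror
the patch; the costs are placeholders):

  #include <stdbool.h>

  static int
  pick_cost (int shiftval, bool bit31, int cost)
  {
    int alt_cost = 0;  /* the patch leaves this uninitialized when bit31 is set.  */

    if (shiftval && !bit31)
      alt_cost = 2;    /* stand-in for 2 + riscv_build_integer_1 (...).  */

    /* && evaluates left to right, so the original order
         shiftval && alt_cost < cost && !bit31
       reads alt_cost before !bit31 can short-circuit the test; with
       !bit31 first, alt_cost is only read after it was assigned.  */
    if (shiftval && !bit31 && alt_cost < cost)
      return alt_cost;
    return cost;
  }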




> +	{
> +	  /* We need to save the first constant we build.  */
> +	  alt_codes[alt_cost - 3].save_temporary = true;
> +
> +	  /* Now we want to shift the previously generated constant into the
> +	     high half.  */
> +	  alt_codes[alt_cost - 2].code = ASHIFT;
> +	  alt_codes[alt_cost - 2].value = shiftval;
> +	  alt_codes[alt_cost - 2].use_uw = false;
> +	  alt_codes[alt_cost - 2].save_temporary = false;
> +
> +	  /* And the final step, IOR the two halves together.  Since this uses
> +	     the saved temporary, use CONCAT similar to what we do for Zbkb.  */
> +	  alt_codes[alt_cost - 1].code = CONCAT;
> +	  alt_codes[alt_cost - 1].value = 0;
> +	  alt_codes[alt_cost - 1].use_uw = false;
> +	  alt_codes[alt_cost - 1].save_temporary = false;
> +
> +	  memcpy (codes, alt_codes, sizeof (alt_codes));
> +	  cost = alt_cost;
> +	}
> +
> +      if (cost > 4 && !bit31 && TARGET_ZBA)
> +	{
> +	  int value = 0;
> +
> +	  /* Check for a shNadd.  */
> +	  if (hival == loval * 3)
> +	    value = 3;
> +	  else if (hival == loval * 5)
> +	    value = 5;
> +	  else if (hival == loval * 9)
> +	    value = 9;
> +
> +	  if (value)
> +	    alt_cost = 2 + riscv_build_integer_1 (alt_codes,
> +						  sext_hwi (loval, 32), mode);
> +
> +	  /* For constants where the upper half is a shNadd of the lower half
> +	     we can do a similar transformation.  */
> +	  if (value && alt_cost < cost)
> +	    {
> +	      alt_codes[alt_cost - 3].save_temporary = true;
> +	      alt_codes[alt_cost - 2].code = FMA;
> +	      alt_codes[alt_cost - 2].value = value;
> +	      alt_codes[alt_cost - 2].use_uw = false;
> +	      alt_codes[alt_cost - 2].save_temporary = false;
> +	      alt_codes[alt_cost - 1].code = CONCAT;
> +	      alt_codes[alt_cost - 1].value = 0;
> +	      alt_codes[alt_cost - 1].use_uw = false;
> +	      alt_codes[alt_cost - 1].save_temporary = false;
> +
> +	      memcpy (codes, alt_codes, sizeof (alt_codes));
> +	      cost = alt_cost;
> +	    }
> +	}
> +
> +      if (cost > 4 && !bit31)
> +	{
> +	  int value = hival - loval;
> +
> +	  /* For constants where the halves differ by less than 2048 we can
> +	     generate the upper half by using an addi on the lower half then
> +	     using a shift by 32 followed by an or.  */
> +	  if (abs (value) <= 2047)
Using IN_RANGE (value, -2048, 2047) would probably be better and capture 
one more case, -2048 :-)
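
Concretely (in_range below is a plain-C stand-in for GCC's IN_RANGE
macro):

  #include <stdio.h>
  #include <stdlib.h>

  static int
  in_range (long v, long lo, long hi)
  {
    return v >= lo && v <= hi;
  }

  int
  main (void)
  {
    long value = -2048;  /* the most negative addi immediate */
    printf ("%d\n", labs (value) <= 2047);           /* 0: rejected */
    printf ("%d\n", in_range (value, -2048, 2047));  /* 1: accepted */
    return 0;
  }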


I know you're out of the office, so I'll make those two minor 
adjustments, do a quick test, and push this to the trunk.


jeff

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index a38cb72f09f..df8a5a1c1e2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1231,6 +1231,122 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	}
 
     }
+  else if (cost > 4 && TARGET_64BIT && can_create_pseudo_p ()
+	   && allow_new_pseudos)
+    {
+      struct riscv_integer_op alt_codes[RISCV_MAX_INTEGER_OPS];
+      int alt_cost;
+
+      unsigned HOST_WIDE_INT loval = value & 0xffffffff;
+      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+      bool bit31 = (loval & 0x80000000) != 0;
+      int trailing_shift = ctz_hwi (loval) - ctz_hwi (hival);
+      int leading_shift = clz_hwi (loval) - clz_hwi (hival);
+      int shiftval = 0;
+
+      /* Adjust the shift into the high half accordingly.  */
+      if ((trailing_shift > 0 && hival == (loval >> trailing_shift)))
+	shiftval = 32 - trailing_shift;
+      else if ((leading_shift > 0 && hival == (loval << leading_shift)))
+	shiftval = 32 + leading_shift;
+
+      if (shiftval && !bit31)
+	alt_cost = 2 + riscv_build_integer_1 (alt_codes, sext_hwi (loval, 32),
+					      mode);
+
+      /* For constants where the upper half is a shift of the lower half we
+	 can do a shift followed by an or.  */
+      if (shiftval && alt_cost < cost && !bit31)
+	{
+	  /* We need to save the first constant we build.  */
+	  alt_codes[alt_cost - 3].save_temporary = true;
+
+	  /* Now we want to shift the previously generated constant into the
+	     high half.  */
+	  alt_codes[alt_cost - 2].code = ASHIFT;
+	  alt_codes[alt_cost - 2].value = shiftval;
+	  alt_codes[alt_cost - 2].use_uw = false;
+	  alt_codes[alt_cost - 2].save_temporary = false;
+
+	  /* And the final step, IOR the two halves together.  Since this uses
+	     the saved temporary, use CONCAT similar to what we do for Zbkb.  */
+	  alt_codes[alt_cost - 1].code = CONCAT;
+	  alt_codes[alt_cost - 1].value = 0;
+	  alt_codes[alt_cost - 1].use_uw = false;
+	  alt_codes[alt_cost - 1].save_temporary = false;
+
+	  memcpy (codes, alt_codes, sizeof (alt_codes));
+	  cost = alt_cost;
+	}
+
+      if (cost > 4 && !bit31 && TARGET_ZBA)
+	{
+	  int value = 0;
+
+	  /* Check for a shNadd.  */
+	  if (hival == loval * 3)
+	    value = 3;
+	  else if (hival == loval * 5)
+	    value = 5;
+	  else if (hival == loval * 9)
+	    value = 9;
+
+	  if (value)
+	    alt_cost = 2 + riscv_build_integer_1 (alt_codes,
+						  sext_hwi (loval, 32), mode);
+
+	  /* For constants where the upper half is a shNadd of the lower half
+	     we can do a similar transformation.  */
+	  if (value && alt_cost < cost)
+	    {
+	      alt_codes[alt_cost - 3].save_temporary = true;
+	      alt_codes[alt_cost - 2].code = FMA;
+	      alt_codes[alt_cost - 2].value = value;
+	      alt_codes[alt_cost - 2].use_uw = false;
+	      alt_codes[alt_cost - 2].save_temporary = false;
+	      alt_codes[alt_cost - 1].code = CONCAT;
+	      alt_codes[alt_cost - 1].value = 0;
+	      alt_codes[alt_cost - 1].use_uw = false;
+	      alt_codes[alt_cost - 1].save_temporary = false;
+
+	      memcpy (codes, alt_codes, sizeof (alt_codes));
+	      cost = alt_cost;
+	    }
+	}
+
+      if (cost > 4 && !bit31)
+	{
+	  int value = hival - loval;
+
+	  /* For constants where the halves differ by less than 2048 we can
+	     generate the upper half by using an addi on the lower half then
+	     using a shift by 32 followed by an or.  */
+	  if (abs (value) <= 2047)
+	    {
+	      alt_cost = 3 + riscv_build_integer_1 (alt_codes,
+						    sext_hwi (loval, 32), mode);
+	      if (alt_cost < cost)
+		{
+		  alt_codes[alt_cost - 4].save_temporary = true;
+		  alt_codes[alt_cost - 3].code = PLUS;
+		  alt_codes[alt_cost - 3].value = value;
+		  alt_codes[alt_cost - 3].use_uw = false;
+		  alt_codes[alt_cost - 3].save_temporary = false;
+		  alt_codes[alt_cost - 2].code = ASHIFT;
+		  alt_codes[alt_cost - 2].value = 32;
+		  alt_codes[alt_cost - 2].use_uw = false;
+		  alt_codes[alt_cost - 2].save_temporary = false;
+		  alt_codes[alt_cost - 1].code = CONCAT;
+		  alt_codes[alt_cost - 1].value = 0;
+		  alt_codes[alt_cost - 1].use_uw = false;
+		  alt_codes[alt_cost - 1].save_temporary = false;
+
+		  memcpy (codes, alt_codes, sizeof (alt_codes));
+		  cost = alt_cost;
+		}
+	    }
+	}
+    }
 
   return cost;
 }
@@ -2864,12 +2980,22 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	    }
 	  else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
 	    {
-	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
-	      rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
-	      gcc_assert (t2);
-	      t2 = gen_lowpart (SImode, t2);
-	      emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
-	      x = t;
+	      if (codes[i].code == CONCAT && !TARGET_ZBKB)
+		{
+		  /* The two values should have no bits in common, so we can
+		     use PLUS instead of IOR which has a higher chance of
+		     using a compressed instruction.  */
+		  x = gen_rtx_PLUS (mode, x, old_value);
+		}
+	      else
+		{
+		  rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+		  rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+		  gcc_assert (t2);
+		  t2 = gen_lowpart (SImode, t2);
+		  emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
+		  x = t;
+		}
 	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-12.c b/gcc/testsuite/gcc.target/riscv/synthesis-12.c
new file mode 100644
index 00000000000..bf2f89042a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-12.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 45 } } */
+
+
+unsigned long foo_0x7857f2de7857f2de(void) { return 0x7857f2de7857f2deUL; }
+unsigned long foo_0x7fffdffe3fffefff(void) { return 0x7fffdffe3fffefffUL; }
+unsigned long foo_0x1ffff7fe3fffeffc(void) { return 0x1ffff7fe3fffeffcUL; }
+unsigned long foo_0x0a3fdbf0028ff6fc(void) { return 0x0a3fdbf0028ff6fcUL; }
+unsigned long foo_0x014067e805019fa0(void) { return 0x014067e805019fa0UL; }
+unsigned long foo_0x09d87e90009d87e9(void) { return 0x09d87e90009d87e9UL; }
+unsigned long foo_0x2302320000118119(void) { return 0x2302320000118119UL; }
+unsigned long foo_0x000711eb00e23d60(void) { return 0x000711eb00e23d60UL; }
+unsigned long foo_0x5983800001660e00(void) { return 0x5983800001660e00UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-13.c b/gcc/testsuite/gcc.target/riscv/synthesis-13.c
new file mode 100644
index 00000000000..957410acda1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-13.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 45 } } */
+
+
+unsigned long foo_0x7907d89a2857f2de(void) { return 0x7907d89a2857f2deUL; }
+unsigned long foo_0x4fffaffb0fffefff(void) { return 0x4fffaffb0fffefffUL; }
+unsigned long foo_0x23ff6fdc03ffeffc(void) { return 0x23ff6fdc03ffeffcUL; }
+unsigned long foo_0x170faedc028ff6fc(void) { return 0x170faedc028ff6fcUL; }
+unsigned long foo_0x5704dee01d019fa0(void) { return 0x5704dee01d019fa0UL; }
+unsigned long foo_0x0589c731009d87e9(void) { return 0x0589c731009d87e9UL; }
+unsigned long foo_0x0057857d00118119(void) { return 0x0057857d00118119UL; }
+unsigned long foo_0x546b32e010e23d60(void) { return 0x546b32e010e23d60UL; }
+unsigned long foo_0x64322a0021660e00(void) { return 0x64322a0021660e00UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-14.c b/gcc/testsuite/gcc.target/riscv/synthesis-14.c
new file mode 100644
index 00000000000..bd4e4afa55a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-14.c
@@ -0,0 +1,28 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 65 } } */
+
+
+unsigned long foo_0x7857faae7857f2de(void) { return 0x7857faae7857f2deUL; }
+unsigned long foo_0x0ffff7fe0fffefff(void) { return 0x0ffff7fe0fffefffUL; }
+unsigned long foo_0x7857f2de7857faae(void) { return 0x7857f2de7857faaeUL; }
+unsigned long foo_0x7857f2af7857faae(void) { return 0x7857f2af7857faaeUL; }
+unsigned long foo_0x5fbfffff5fbffae5(void) { return 0x5fbfffff5fbffae5UL; }
+unsigned long foo_0x3d3079db3d3079ac(void) { return 0x3d3079db3d3079acUL; }
+unsigned long foo_0x046075fe046078a8(void) { return 0x046075fe046078a8UL; }
+unsigned long foo_0x2411811a24118119(void) { return 0x2411811a24118119UL; }
+unsigned long foo_0x70e23d6a70e23d6b(void) { return 0x70e23d6a70e23d6bUL; }
+unsigned long foo_0x0c01df8c0c01df7d(void) { return 0x0c01df8c0c01df7dUL; }
+unsigned long foo_0x7fff07d07fff0000(void) { return 0x7fff07d07fff0000UL; }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-7.c b/gcc/testsuite/gcc.target/riscv/synthesis-7.c
index 5a69d2e5f21..c71c3cd408a 100644
--- a/gcc/testsuite/gcc.target/riscv/synthesis-7.c
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-7.c
@@ -12,7 +12,7 @@ 
    total number of instructions.
 
    This isn't expected to change much and any change is worthy of a look.  */
-/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli)" 7571 } } */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|ret|sh1add|sh2add|sh3add|slli|bclri)" 7575 } } */
 
 unsigned long foo_0xfffff7fe7ffff7ff(void) { return 0xfffff7fe7ffff7ffUL; }
 unsigned long foo_0xffffeffe7ffff7ff(void) { return 0xffffeffe7ffff7ffUL; }