diff mbox series

[1/2] RISC-V: Constant synthesis with same upper and lower halves

Message ID 20240808171010.16216-1-rzinsly@ventanamicro.com
State New
Headers show
Series [1/2] RISC-V: Constant synthesis with same upper and lower halves | expand

Commit Message

Raphael Moreira Zinsly Aug. 8, 2024, 5:10 p.m. UTC
From: Raphael Zinsly <rzinsly@ventanamicro.com>

Improve handling of constants where its upper and lower 32-bit
halves are the same and Zbkb is not available in riscv_move_integer.
riscv_split_integer already handles this but the changes in
riscv_build_integer makes it possible to improve code generation for
negative values.

e.g. for:

unsigned long f (void) { return 0xf857f2def857f2deUL; }

Without the patch:

li      a0,-128454656
addi    a0,a0,734
li      a5,-128454656
addi    a5,a5,735
slli    a5,a5,32
add     a0,a5,a0

With the patch:

li      a0,128454656
addi    a0,a0,-735
slli    a5,a0,32
add     a0,a0,a5
xori    a0,a0,-1

gcc/ChangeLog:
	* config/riscv/riscv.cc (riscv_build_integer): Detect constants
	with the same 32-bit halves and without Zbkb.
	(riscv_move_integer): Add synthesys of these constants.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/synthesis-11.c: New test.

	Co-authored-by: Jeff Law <jlaw@ventanamicro.com>
---
 gcc/config/riscv/riscv.cc                     | 59 +++++++++++++++++--
 gcc/testsuite/gcc.target/riscv/synthesis-11.c | 40 +++++++++++++
 2 files changed, 93 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-11.c

Comments

Vineet Gupta Aug. 9, 2024, 11:51 p.m. UTC | #1
Hi Raphael,

On 8/8/24 10:10, Raphael Moreira Zinsly wrote:
> From: Raphael Zinsly <rzinsly@ventanamicro.com>
>
> Improve handling of constants where its upper and lower 32-bit
> halves are the same and Zbkb is not available in riscv_move_integer.
> riscv_split_integer already handles this but the changes in
> riscv_build_integer makes it possible to improve code generation for
> negative values.

But...

> e.g. for:
>
> unsigned long f (void) { return 0xf857f2def857f2deUL; }
>
> Without the patch:
>
> li      a0,-128454656
> addi    a0,a0,734
> li      a5,-128454656
> addi    a5,a5,735
> slli    a5,a5,32
> add     a0,a5,a0
>
> With the patch:
>
> li      a0,128454656
> addi    a0,a0,-735
> slli    a5,a0,32
> add     a0,a0,a5
> xori    a0,a0,-1

[snip...]

>      }
> +  else if (cost > 3 && TARGET_64BIT && can_create_pseudo_p ())
> +    {
> +      struct riscv_integer_op alt_codes[RISCV_MAX_INTEGER_OPS];
> +      int alt_cost;
> +
> +      unsigned HOST_WIDE_INT loval = value & 0xffffffff;
> +      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
> +      bool bit31 = (hival & 0x80000000) != 0;
> +      /* Without pack we can generate it with a shift 32 followed by an or.  */
> +      if (hival == loval && !bit31)
> +	{
> +	  alt_cost = 2 + riscv_build_integer_1 (alt_codes,
> +						sext_hwi (loval, 32), mode);
> +	  if (alt_cost < cost)
> +	    {
> +	      /* We need to save the first constant we build.  */
> +	      alt_codes[alt_cost - 3].save_temporary = true;
> +
> +	      /* Now we want to shift the previously generated constant into the
> +		 high half.  */
> +	      alt_codes[alt_cost - 2].code = ASHIFT;
> +	      alt_codes[alt_cost - 2].value = 32;
> +	      alt_codes[alt_cost - 2].use_uw = false;
> +	      alt_codes[alt_cost - 2].save_temporary = false;
> +
> +	      /* And the final step, IOR the two halves together.  Since this uses
> +		 the saved temporary, use CONCAT similar to what we do for Zbkb.  */
> +	      alt_codes[alt_cost - 1].code = CONCAT;
> +	      alt_codes[alt_cost - 1].value = 0;
> +	      alt_codes[alt_cost - 1].use_uw = false;
> +	      alt_codes[alt_cost - 1].save_temporary = false;
> +
> +	      memcpy (codes, alt_codes, sizeof (alt_codes));
> +	      cost = alt_cost;
> +	    }
> +	}
> +    }

As you mentioned above: positive values are already handled in riscv_split_integer after
c104ef4b5eb1 ("RISC-V: improve codegen for large constants with same 32-bit lo and hi")parts [2]

Meaning this is redundant with the code from above commit.
If you prefer these changes (in light of patch 2/2 and/or zbkb changes tending to this code) IMO that code should be removed.

FWIW at the time I did try (in vain) to handle negative values too but my change was broken [3]

[3] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623365.html

-Vineet

>    return cost;
>  }
> @@ -2786,12 +2823,22 @@ riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
>  	    }
>  	  else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
>  	    {
> -	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
> -	      rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
> -	      gcc_assert (t2);
> -	      t2 = gen_lowpart (SImode, t2);
> -	      emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
> -	      x = t;
> +	      if (codes[i].code == CONCAT && !TARGET_ZBKB)
> +		{
> +		  /* The two values should have no bits in common, so we can
> +		     use PLUS instead of IOR which has a higher chance of
> +		     using a compressed instruction.  */
> +		  x = gen_rtx_PLUS (mode, x, old_value);
> +		}
> +	      else
> +		{
> +		  rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
> +		  rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
> +		  gcc_assert (t2);
> +		  t2 = gen_lowpart (SImode, t2);
> +		  emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
> +		  x = t;
> +		}
>  	    }
>  	  else
>  	    x = gen_rtx_fmt_ee (codes[i].code, mode,
> diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-11.c b/gcc/testsuite/gcc.target/riscv/synthesis-11.c
> new file mode 100644
> index 00000000000..98401d5ca32
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/synthesis-11.c
> @@ -0,0 +1,40 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target rv64 } */
> +/* We aggressively skip as we really just need to test the basic synthesis
> +   which shouldn't vary based on the optimization level.  -O1 seems to work
> +   and eliminates the usual sources of extraneous dead code that would throw
> +   off the counts.  */
> +/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
> +/* { dg-options "-march=rv64gc" } */
> +
> +/* Rather than test for a specific synthesis of all these constants or
> +   having thousands of tests each testing one variant, we just test the
> +   total number of instructions.
> +
> +   This isn't expected to change much and any change is worthy of a look.  */
> +/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 114 } } */
> +
> +
> +
> +unsigned long foo_0x7857f2de7857f2de(void) { return 0x7857f2de7857f2deUL; }
> +unsigned long foo_0x19660e6319660e63(void) { return 0x19660e6319660e63UL; }
> +unsigned long foo_0x137f1b75137f1b75(void) { return 0x137f1b75137f1b75UL; }
> +unsigned long foo_0x35019fa035019fa0(void) { return 0x35019fa035019fa0UL; }
> +unsigned long foo_0x3828e6c13828e6c1(void) { return 0x3828e6c13828e6c1UL; }
> +unsigned long foo_0x039d87e9039d87e9(void) { return 0x039d87e9039d87e9UL; }
> +unsigned long foo_0x429617c1429617c1(void) { return 0x429617c1429617c1UL; }
> +unsigned long foo_0x2411811924118119(void) { return 0x2411811924118119UL; }
> +unsigned long foo_0x0c01df7d0c01df7d(void) { return 0x0c01df7d0c01df7dUL; }
> +unsigned long foo_0x70e23d6b70e23d6b(void) { return 0x70e23d6b70e23d6bUL; }
> +unsigned long foo_0xf857f2def857f2de(void) { return 0xf857f2def857f2deUL; }
> +unsigned long foo_0x99660e6399660e63(void) { return 0x99660e6399660e63UL; }
> +unsigned long foo_0x937f1b75937f1b75(void) { return 0x937f1b75937f1b75UL; }
> +unsigned long foo_0xb5019fa0b5019fa0(void) { return 0xb5019fa0b5019fa0UL; }
> +unsigned long foo_0xb828e6c1b828e6c1(void) { return 0xb828e6c1b828e6c1UL; }
> +unsigned long foo_0x839d87e9839d87e9(void) { return 0x839d87e9839d87e9UL; }
> +unsigned long foo_0xc29617c1c29617c1(void) { return 0xc29617c1c29617c1UL; }
> +unsigned long foo_0xa4118119a4118119(void) { return 0xa4118119a4118119UL; }
> +unsigned long foo_0x8c01df7d8c01df7d(void) { return 0x8c01df7d8c01df7dUL; }
> +unsigned long foo_0xf0e23d6bf0e23d6b(void) { return 0xf0e23d6bf0e23d6bUL; }
> +unsigned long foo_0x7fff00007fff0000(void) { return 0x7fff00007fff0000UL; }
> +
Jeff Law Aug. 12, 2024, 5:55 p.m. UTC | #2
On 8/9/24 5:51 PM, Vineet Gupta wrote:

> 
> As you mentioned above: positive values are already handled in riscv_split_integer after
> c104ef4b5eb1 ("RISC-V: improve codegen for large constants with same 32-bit lo and hi")parts [2]
Weird. I would have swore we weren't seeing good code for those cases. 
But checking now it looks happy.

> Meaning this is redundant with the code from above commit.
> If you prefer these changes (in light of patch 2/2 and/or zbkb changes tending to this code) IMO that code should be removed.
No strong opinion on which to keep.  Given yours are in the tree, let's 
keep yours unless we can't find a way to handle the negative constant case.

For negative constants (without zbkb), I think the desired synthesis 
would be to first generate the inverted constant which must be positive. 
  That's going to result in a 4 instruction sequence most of the time. 
Then bit flip the result.  That's likely still faster than the constant 
pool approach and fewer instructions than the current synthesis where we 
generate something like:


So let's withdraw this patch and refocus on the negative case.

jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 8ece7859945..454220d8ba4 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1223,6 +1223,43 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 	}
 
     }
+  else if (cost > 3 && TARGET_64BIT && can_create_pseudo_p ())
+    {
+      struct riscv_integer_op alt_codes[RISCV_MAX_INTEGER_OPS];
+      int alt_cost;
+
+      unsigned HOST_WIDE_INT loval = value & 0xffffffff;
+      unsigned HOST_WIDE_INT hival = (value & ~loval) >> 32;
+      bool bit31 = (hival & 0x80000000) != 0;
+      /* Without pack we can generate it with a shift 32 followed by an or.  */
+      if (hival == loval && !bit31)
+	{
+	  alt_cost = 2 + riscv_build_integer_1 (alt_codes,
+						sext_hwi (loval, 32), mode);
+	  if (alt_cost < cost)
+	    {
+	      /* We need to save the first constant we build.  */
+	      alt_codes[alt_cost - 3].save_temporary = true;
+
+	      /* Now we want to shift the previously generated constant into the
+		 high half.  */
+	      alt_codes[alt_cost - 2].code = ASHIFT;
+	      alt_codes[alt_cost - 2].value = 32;
+	      alt_codes[alt_cost - 2].use_uw = false;
+	      alt_codes[alt_cost - 2].save_temporary = false;
+
+	      /* And the final step, IOR the two halves together.  Since this uses
+		 the saved temporary, use CONCAT similar to what we do for Zbkb.  */
+	      alt_codes[alt_cost - 1].code = CONCAT;
+	      alt_codes[alt_cost - 1].value = 0;
+	      alt_codes[alt_cost - 1].use_uw = false;
+	      alt_codes[alt_cost - 1].save_temporary = false;
+
+	      memcpy (codes, alt_codes, sizeof (alt_codes));
+	      cost = alt_cost;
+	    }
+	}
+    }
 
   return cost;
 }
@@ -2786,12 +2823,22 @@  riscv_move_integer (rtx temp, rtx dest, HOST_WIDE_INT value,
 	    }
 	  else if (codes[i].code == CONCAT || codes[i].code == VEC_MERGE)
 	    {
-	      rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
-	      rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
-	      gcc_assert (t2);
-	      t2 = gen_lowpart (SImode, t2);
-	      emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
-	      x = t;
+	      if (codes[i].code == CONCAT && !TARGET_ZBKB)
+		{
+		  /* The two values should have no bits in common, so we can
+		     use PLUS instead of IOR which has a higher chance of
+		     using a compressed instruction.  */
+		  x = gen_rtx_PLUS (mode, x, old_value);
+		}
+	      else
+		{
+		  rtx t = can_create_pseudo_p () ? gen_reg_rtx (mode) : temp;
+		  rtx t2 = codes[i].code == VEC_MERGE ? old_value : x;
+		  gcc_assert (t2);
+		  t2 = gen_lowpart (SImode, t2);
+		  emit_insn (gen_riscv_xpack_di_si_2 (t, x, GEN_INT (32), t2));
+		  x = t;
+		}
 	    }
 	  else
 	    x = gen_rtx_fmt_ee (codes[i].code, mode,
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-11.c b/gcc/testsuite/gcc.target/riscv/synthesis-11.c
new file mode 100644
index 00000000000..98401d5ca32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-11.c
@@ -0,0 +1,40 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 114 } } */
+
+
+
+unsigned long foo_0x7857f2de7857f2de(void) { return 0x7857f2de7857f2deUL; }
+unsigned long foo_0x19660e6319660e63(void) { return 0x19660e6319660e63UL; }
+unsigned long foo_0x137f1b75137f1b75(void) { return 0x137f1b75137f1b75UL; }
+unsigned long foo_0x35019fa035019fa0(void) { return 0x35019fa035019fa0UL; }
+unsigned long foo_0x3828e6c13828e6c1(void) { return 0x3828e6c13828e6c1UL; }
+unsigned long foo_0x039d87e9039d87e9(void) { return 0x039d87e9039d87e9UL; }
+unsigned long foo_0x429617c1429617c1(void) { return 0x429617c1429617c1UL; }
+unsigned long foo_0x2411811924118119(void) { return 0x2411811924118119UL; }
+unsigned long foo_0x0c01df7d0c01df7d(void) { return 0x0c01df7d0c01df7dUL; }
+unsigned long foo_0x70e23d6b70e23d6b(void) { return 0x70e23d6b70e23d6bUL; }
+unsigned long foo_0xf857f2def857f2de(void) { return 0xf857f2def857f2deUL; }
+unsigned long foo_0x99660e6399660e63(void) { return 0x99660e6399660e63UL; }
+unsigned long foo_0x937f1b75937f1b75(void) { return 0x937f1b75937f1b75UL; }
+unsigned long foo_0xb5019fa0b5019fa0(void) { return 0xb5019fa0b5019fa0UL; }
+unsigned long foo_0xb828e6c1b828e6c1(void) { return 0xb828e6c1b828e6c1UL; }
+unsigned long foo_0x839d87e9839d87e9(void) { return 0x839d87e9839d87e9UL; }
+unsigned long foo_0xc29617c1c29617c1(void) { return 0xc29617c1c29617c1UL; }
+unsigned long foo_0xa4118119a4118119(void) { return 0xa4118119a4118119UL; }
+unsigned long foo_0x8c01df7d8c01df7d(void) { return 0x8c01df7d8c01df7dUL; }
+unsigned long foo_0xf0e23d6bf0e23d6b(void) { return 0xf0e23d6bf0e23d6bUL; }
+unsigned long foo_0x7fff00007fff0000(void) { return 0x7fff00007fff0000UL; }
+