diff mbox series

[3/3] RISC-V: Constant synthesis of inverted halves

Message ID 20240902200157.328705-3-rzinsly@ventanamicro.com
State New
Headers show
Series [1/3] RISC-V: Improve codegen for negative repeating large constants | expand

Commit Message

Raphael Moreira Zinsly Sept. 2, 2024, 8:01 p.m. UTC
Improve handling of constants where the high half can be constructed by
inverting the lower half.

gcc/ChangeLog:
	* config/riscv/riscv.cc (riscv_build_integer): Detect constants
	were the higher half is the lower half inverted.

gcc/testsuite/ChangeLog:
	* gcc.target/riscv/synthesis-15.c: New test.
---
 gcc/config/riscv/riscv.cc                     | 30 +++++++++++++++++++
 gcc/testsuite/gcc.target/riscv/synthesis-15.c | 26 ++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-15.c

Comments

Jeff Law Sept. 4, 2024, 11:35 p.m. UTC | #1
On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
> Improve handling of constants where the high half can be constructed by
> inverting the lower half.
> 
> gcc/ChangeLog:
> 	* config/riscv/riscv.cc (riscv_build_integer): Detect constants
> 	were the higher half is the lower half inverted.
> 
> gcc/testsuite/ChangeLog:
> 	* gcc.target/riscv/synthesis-15.c: New test.
> ---
>   gcc/config/riscv/riscv.cc                     | 30 +++++++++++++++++++
>   gcc/testsuite/gcc.target/riscv/synthesis-15.c | 26 ++++++++++++++++
>   2 files changed, 56 insertions(+)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-15.c
> 
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 64d5611cbd2..9eb62e34b5b 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -1343,6 +1343,36 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
>   		  alt_codes[alt_cost - 1].use_uw = false;
>   		  alt_codes[alt_cost - 1].save_temporary = false;
>   
> +		  memcpy (codes, alt_codes, sizeof (alt_codes));
> +		  cost = alt_cost;
> +		}
> +	    }
> +	}
> +
> +      if (cost > 5 && !bit31)
> +	{
> +	  /* For constants where the upper half is the lower half inverted we can flip
> +	     it with an xor and do a shift 32 followed by an or.  */
> +	  if (hival == (~loval & 0xffffffff))
> +	    {
> +	      alt_cost = 3 + riscv_build_integer_1 (alt_codes,
> +						    sext_hwi (loval, 32), mode);
> +	      if (alt_cost < cost)
> +		{
> +		  alt_codes[alt_cost - 4].save_temporary = true;
> +		  alt_codes[alt_cost - 3].code = XOR;
> +		  alt_codes[alt_cost - 3].value = -1;
> +		  alt_codes[alt_cost - 3].use_uw = false;
> +		  alt_codes[alt_cost - 3].save_temporary = false;
> +		  alt_codes[alt_cost - 2].code = ASHIFT;
> +		  alt_codes[alt_cost - 2].value = 32;
> +		  alt_codes[alt_cost - 2].use_uw = false;
> +		  alt_codes[alt_cost - 2].save_temporary = false;
> +		  alt_codes[alt_cost - 1].code = CONCAT;
> +		  alt_codes[alt_cost - 1].value = 0;
> +		  alt_codes[alt_cost - 1].use_uw = false;
> +		  alt_codes[alt_cost - 1].save_temporary = false;
> +
>   		  memcpy (codes, alt_codes, sizeof (alt_codes));
>   		  cost = alt_cost;
>   		}
> diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-15.c b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
> new file mode 100644
> index 00000000000..eaec6119a72
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
> @@ -0,0 +1,26 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target rv64 } */
> +/* We aggressively skip as we really just need to test the basic synthesis
> +   which shouldn't vary based on the optimization level.  -O1 seems to work
> +   and eliminates the usual sources of extraneous dead code that would throw
> +   off the counts.  */
> +/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
> +/* { dg-options "-march=rv64gc" } */
> +
> +/* Rather than test for a specific synthesis of all these constants or
> +   having thousands of tests each testing one variant, we just test the
> +   total number of instructions.
> +
> +   This isn't expected to change much and any change is worthy of a look.  */
> +/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 60 } } */
> +
> +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
> +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
> +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
> +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
> +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
> +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
> +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
> +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
> +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
> +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }
I must be missing something.  All the tests have bit31 on.  But I don't 
think this synthesis is valid when bit31 is on and the code seems to 
check this.  What am I missing?

jeff
Raphael Moreira Zinsly Sept. 5, 2024, 12:18 p.m. UTC | #2
On Wed, Sep 4, 2024 at 8:35 PM Jeff Law <jlaw@ventanamicro.com> wrote:
> On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
>...
> > +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
> > +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
> > +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
> > +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
> > +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
> > +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
> > +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
> > +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
> > +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
> > +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }
> I must be missing something.  All the tests have bit31 on.  But I don't
> think this synthesis is valid when bit31 is on and the code seems to
> check this.  What am I missing?

The upper half is the one that is shifted so we check for bit31 of the hival:
bool bit31 = (hival & 0x80000000) != 0;
Maybe we should change the name of the variable to bit63.




--
Raphael Moreira Zinsly
Jeff Law Sept. 5, 2024, 6:40 p.m. UTC | #3
On 9/5/24 6:18 AM, Raphael Zinsly wrote:
> On Wed, Sep 4, 2024 at 8:35 PM Jeff Law <jlaw@ventanamicro.com> wrote:
>> On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
>> ...
>>> +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
>>> +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
>>> +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
>>> +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
>>> +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
>>> +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
>>> +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
>>> +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
>>> +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
>>> +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }
>> I must be missing something.  All the tests have bit31 on.  But I don't
>> think this synthesis is valid when bit31 is on and the code seems to
>> check this.  What am I missing?
> 
> The upper half is the one that is shifted so we check for bit31 of the hival:
> bool bit31 = (hival & 0x80000000) != 0;
> Maybe we should change the name of the variable to bit63.
Ah!  missed that it comes from hival...

But doesn't that highlight the problem.  The lui/addi to construct the 
low bits will sign extend the result out to bit 63 which is why the 
synthesis doesn't work when bit 31 is on.


More concretely for:

unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }


The resulting code looks like:

>         li      a5,-1258184704
>         addi    a5,a5,-96
>         xori    a0,a5,-1
>         slli    a0,a0,32
>         add     a0,a0,a5

Which I think is wrong.

The problem is $a5 is going to have the sign extended low constant after 
the li+addi:

0xffffffffb5019fa0

We invert that resulting in this value for a0:

0xffffffffb5019fa0

We shift that 32 bits with a new value in a0:

0x4afe605f00000000

So we have these values before the last step.

a5:  0xffffffffb5019fa0
a0:  0x4afe605f00000000

That can't be right.  That's going to result in:

0x4afe605eb5019fa0
          ^
          ^-- that should have been 0xf


The sequence will work if bit31 is off.  Bit 63's value doesn't really 
matter.

It may help from a mental model to remember that the two input values to 
that final PLUS must not have any set bits in common.  We're using a 
PLUS because it's more likely to compress vs an IOR.  If we had used the 
more obvious IOR your sequence would have generated:

0xffffffffb5019fa0

Jeff
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 64d5611cbd2..9eb62e34b5b 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1343,6 +1343,36 @@  riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
 		  alt_codes[alt_cost - 1].use_uw = false;
 		  alt_codes[alt_cost - 1].save_temporary = false;
 
+		  memcpy (codes, alt_codes, sizeof (alt_codes));
+		  cost = alt_cost;
+		}
+	    }
+	}
+
+      if (cost > 5 && !bit31)
+	{
+	  /* For constants where the upper half is the lower half inverted we can flip
+	     it with an xor and do a shift 32 followed by an or.  */
+	  if (hival == (~loval & 0xffffffff))
+	    {
+	      alt_cost = 3 + riscv_build_integer_1 (alt_codes,
+						    sext_hwi (loval, 32), mode);
+	      if (alt_cost < cost)
+		{
+		  alt_codes[alt_cost - 4].save_temporary = true;
+		  alt_codes[alt_cost - 3].code = XOR;
+		  alt_codes[alt_cost - 3].value = -1;
+		  alt_codes[alt_cost - 3].use_uw = false;
+		  alt_codes[alt_cost - 3].save_temporary = false;
+		  alt_codes[alt_cost - 2].code = ASHIFT;
+		  alt_codes[alt_cost - 2].value = 32;
+		  alt_codes[alt_cost - 2].use_uw = false;
+		  alt_codes[alt_cost - 2].save_temporary = false;
+		  alt_codes[alt_cost - 1].code = CONCAT;
+		  alt_codes[alt_cost - 1].value = 0;
+		  alt_codes[alt_cost - 1].use_uw = false;
+		  alt_codes[alt_cost - 1].save_temporary = false;
+
 		  memcpy (codes, alt_codes, sizeof (alt_codes));
 		  cost = alt_cost;
 		}
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-15.c b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
new file mode 100644
index 00000000000..eaec6119a72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
@@ -0,0 +1,26 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 60 } } */
+
+unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
+unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
+unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
+unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
+unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
+unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
+unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
+unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
+unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
+unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }