Message ID | 20240902200157.328705-3-rzinsly@ventanamicro.com
State      | New
Series     | [1/3] RISC-V: Improve codegen for negative repeating large constants
On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
> Improve handling of constants where the high half can be constructed by
> inverting the lower half.
>
> gcc/ChangeLog:
> 	* config/riscv/riscv.cc (riscv_build_integer): Detect constants
> 	where the higher half is the lower half inverted.
>
> gcc/testsuite/ChangeLog:
> 	* gcc.target/riscv/synthesis-15.c: New test.
> ---
>  gcc/config/riscv/riscv.cc                     | 30 +++++++++++++++++++
>  gcc/testsuite/gcc.target/riscv/synthesis-15.c | 26 ++++++++++++++++
>  2 files changed, 56 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/synthesis-15.c
> ...
> +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
> +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
> +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
> +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
> +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
> +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
> +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
> +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
> +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
> +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }

I must be missing something.  All the tests have bit31 on.  But I don't
think this synthesis is valid when bit31 is on and the code seems to
check this.  What am I missing?

jeff
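As an aside for readers following along: the property the patch keys on, and the bit Jeff is asking about, can be checked directly against the new tests. The following standalone C sketch is illustration only, not GCC code, and the check() helper is invented here. It splits a 64-bit constant into 32-bit halves the way the thread does, tests whether the upper half equals the inverted lower half, and reports whether bit 31 of the lower half is set. The inputs are the first three constants from synthesis-15.c; every constant in that file has both properties.

#include <stdint.h>
#include <stdio.h>

/* Standalone illustration, not GCC code: split a 64-bit constant into
   32-bit halves and check the two properties discussed in this thread.  */
static void check (uint64_t value)
{
  uint64_t loval = value & 0xffffffffu;                /* low 32 bits  */
  uint64_t hival = value >> 32;                        /* high 32 bits */
  int high_is_inverted_low = hival == ((~loval) & 0xffffffffu);
  int low_bit31_set = (loval & 0x80000000u) != 0;
  printf ("0x%016llx: high == ~low? %d  low bit 31 set? %d\n",
          (unsigned long long) value, high_is_inverted_low, low_bit31_set);
}

int main (void)
{
  /* First few constants from synthesis-15.c; all of them print 1 1.  */
  check (0x4afe605fb5019fa0ULL);
  check (0x07a80d21f857f2deULL);
  check (0x6699f19c99660e63ULL);
  return 0;
}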
On Wed, Sep 4, 2024 at 8:35 PM Jeff Law <jlaw@ventanamicro.com> wrote:
> On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
> ...
> > +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
> > +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
> > +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
> > +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
> > +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
> > +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
> > +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
> > +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
> > +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
> > +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }
> I must be missing something.  All the tests have bit31 on.  But I don't
> think this synthesis is valid when bit31 is on and the code seems to
> check this.  What am I missing?

The upper half is the one that is shifted, so we check for bit 31 of the hival:

  bool bit31 = (hival & 0x80000000) != 0;

Maybe we should change the name of the variable to bit63.

--
Raphael Moreira Zinsly
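A small note on the suggested rename: if hival holds the upper 32 bits of the 64-bit constant, then testing bit 31 of hival is the same as testing bit 63 of the whole value, which is what makes bit63 the clearer name. A minimal standalone sketch of that equivalence (variable names follow the thread, not necessarily riscv.cc):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  uint64_t value = 0x4afe605fb5019fa0ULL;      /* any 64-bit constant        */
  uint32_t hival = (uint32_t) (value >> 32);   /* upper 32 bits, per thread  */

  int bit31_of_hival = (hival & 0x80000000u) != 0;
  int bit63_of_value = (value >> 63) & 1;

  /* The two tests are always equivalent, hence the "bit63" rename idea.  */
  assert (bit31_of_hival == bit63_of_value);
  return 0;
}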
On 9/5/24 6:18 AM, Raphael Zinsly wrote:
> On Wed, Sep 4, 2024 at 8:35 PM Jeff Law <jlaw@ventanamicro.com> wrote:
>> On 9/2/24 2:01 PM, Raphael Moreira Zinsly wrote:
>> ...
>>> +unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
>>> +unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
>>> +unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
>>> +unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
>>> +unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
>>> +unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
>>> +unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
>>> +unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
>>> +unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
>>> +unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }
>> I must be missing something.  All the tests have bit31 on.  But I don't
>> think this synthesis is valid when bit31 is on and the code seems to
>> check this.  What am I missing?
>
> The upper half is the one that is shifted so we check for bit31 of the hival:
> bool bit31 = (hival & 0x80000000) != 0;
> Maybe we should change the name of the variable to bit63.

Ah!  Missed that it comes from hival...  But doesn't that highlight the
problem?  The lui/addi to construct the low bits will sign extend the
result out to bit 63, which is why the synthesis doesn't work when bit 31
is on.

More concretely, for:

unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }

The resulting code looks like:

>         li      a5,-1258184704
>         addi    a5,a5,-96
>         xori    a0,a5,-1
>         slli    a0,a0,32
>         add     a0,a0,a5

Which I think is wrong.  The problem is $a5 is going to have the sign
extended low constant after the li+addi:

0xffffffffb5019fa0

We invert that, resulting in this value for a0:

0x000000004afe605f

We shift that left by 32 bits, giving a new value in a0:

0x4afe605f00000000

So we have these values before the last step:

a5: 0xffffffffb5019fa0
a0: 0x4afe605f00000000

That can't be right.  That's going to result in:

0x4afe605eb5019fa0
         ^
         ^-- that should have been 0xf

The sequence will work if bit 31 is off.  Bit 63's value doesn't really
matter.

It may help, as a mental model, to remember that the two input values to
that final PLUS must not have any set bits in common.  We're using a PLUS
because it's more likely to compress vs an IOR.  If we had used the more
obvious IOR, your sequence would have generated:

0xffffffffb5019fa0

Jeff
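Jeff's arithmetic can be reproduced in a few lines of plain C. The sketch below is a standalone illustration, not GCC code: it simulates the emitted sequence, materializing the sign-extended low half as li+addi would, then applying the xori with -1, the slli by 32, and the final add, and it also computes the IOR variant mentioned at the end. For 0x4afe605fb5019fa0 both forms come out wrong; the second constant is made up for illustration, with bit 31 of its low half clear and bit 63 set, and there the sequence produces the right value, matching the observation that bit 63 does not matter.

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch: simulate the proposed synthesis sequence
   (li+addi, xori -1, slli 32, add) for a constant whose upper half
   is the inverted lower half, and compare it against the target.  */
static void simulate (uint64_t target)
{
  /* li+addi leaves the low 32 bits sign-extended to 64 bits in a5,
     exactly as an RV64 register would hold them.  */
  int64_t a5 = (int32_t) (target & 0xffffffffu);

  uint64_t a0 = (uint64_t) a5 ^ (uint64_t) -1;   /* xori a0,a5,-1 */
  a0 <<= 32;                                     /* slli a0,a0,32 */

  uint64_t via_add = a0 + (uint64_t) a5;         /* add a0,a0,a5    */
  uint64_t via_or  = a0 | (uint64_t) a5;         /* the IOR variant */

  printf ("target 0x%016llx  add 0x%016llx%s  or 0x%016llx%s\n",
          (unsigned long long) target,
          (unsigned long long) via_add, via_add == target ? " (ok)" : " (WRONG)",
          (unsigned long long) via_or, via_or == target ? " (ok)" : " (WRONG)");
}

int main (void)
{
  simulate (0x4afe605fb5019fa0ULL);  /* low-half bit 31 set: both forms wrong      */
  simulate (0x8000000c7ffffff3ULL);  /* low-half bit 31 clear, bit 63 set: correct */
  return 0;
}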
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 64d5611cbd2..9eb62e34b5b 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1343,6 +1343,36 @@ riscv_build_integer (struct riscv_integer_op *codes, HOST_WIDE_INT value,
                   alt_codes[alt_cost - 1].use_uw = false;
                   alt_codes[alt_cost - 1].save_temporary = false;
 
+                  memcpy (codes, alt_codes, sizeof (alt_codes));
+                  cost = alt_cost;
+                }
+            }
+        }
+
+      if (cost > 5 && !bit31)
+        {
+          /* For constants where the upper half is the lower half inverted we can flip
+             it with an xor and do a shift 32 followed by an or. */
+          if (hival == (~loval & 0xffffffff))
+            {
+              alt_cost = 3 + riscv_build_integer_1 (alt_codes,
+                                                    sext_hwi (loval, 32), mode);
+              if (alt_cost < cost)
+                {
+                  alt_codes[alt_cost - 4].save_temporary = true;
+                  alt_codes[alt_cost - 3].code = XOR;
+                  alt_codes[alt_cost - 3].value = -1;
+                  alt_codes[alt_cost - 3].use_uw = false;
+                  alt_codes[alt_cost - 3].save_temporary = false;
+                  alt_codes[alt_cost - 2].code = ASHIFT;
+                  alt_codes[alt_cost - 2].value = 32;
+                  alt_codes[alt_cost - 2].use_uw = false;
+                  alt_codes[alt_cost - 2].save_temporary = false;
+                  alt_codes[alt_cost - 1].code = CONCAT;
+                  alt_codes[alt_cost - 1].value = 0;
+                  alt_codes[alt_cost - 1].use_uw = false;
+                  alt_codes[alt_cost - 1].save_temporary = false;
+
                   memcpy (codes, alt_codes, sizeof (alt_codes));
                   cost = alt_cost;
                 }
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-15.c b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
new file mode 100644
index 00000000000..eaec6119a72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-15.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions.
+
+   This isn't expected to change much and any change is worthy of a look. */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|li|pack|ret|sh1add|sh2add|sh3add|slli|srli|xori|or)" 60 } } */
+
+unsigned long foo_0x4afe605fb5019fa0(void) { return 0x4afe605fb5019fa0UL; }
+unsigned long foo_0x07a80d21f857f2de(void) { return 0x07a80d21f857f2deUL; }
+unsigned long foo_0x6699f19c99660e63(void) { return 0x6699f19c99660e63UL; }
+unsigned long foo_0x6c80e48a937f1b75(void) { return 0x6c80e48a937f1b75UL; }
+unsigned long foo_0x47d7193eb828e6c1(void) { return 0x47d7193eb828e6c1UL; }
+unsigned long foo_0x7c627816839d87e9(void) { return 0x7c627816839d87e9UL; }
+unsigned long foo_0x3d69e83ec29617c1(void) { return 0x3d69e83ec29617c1UL; }
+unsigned long foo_0x5bee7ee6a4118119(void) { return 0x5bee7ee6a4118119UL; }
+unsigned long foo_0x73fe20828c01df7d(void) { return 0x73fe20828c01df7dUL; }
+unsigned long foo_0x0f1dc294f0e23d6b(void) { return 0x0f1dc294f0e23d6bUL; }