diff mbox series

[to-be-committed,v2,RISC-V] Use bclri in constant synthesis

Message ID 023c8378-f024-494d-9efc-c9c9399d87a4@gmail.com
State New
Headers show
Series [to-be-committed,v2,RISC-V] Use bclri in constant synthesis | expand

Commit Message

Jeff Law May 24, 2024, 4:44 a.m. UTC
Testing with Zbs enabled by default showed a minor logic error.  After 
the loop clearing things with bclri, we can only use the sequence if we 
were able to clear all the necessary bits.  If any bits are still on, 
then the bclr sequence turned out to not be profitable.

--

So this is conceptually similar to how we handled direct generation of
bseti for constant synthesis, but this time for bclr.

In the bclr case, we already have an expander for AND.  So we just
needed to adjust the predicate to accept another class of constant
operands (those with a single bit clear).

With that in place constant synthesis is adjusted so that it counts the
number of bits clear in the high 33 bits of a 64bit word.  If that
number is small relative to the current best cost, then we try to
generate the constant with a lui based sequence for the low half which
implicitly sets the upper 32 bits as well.  Then we bclri one or more of
those upper 33 bits.

So as an example, this code goes from 4 instructions down to 3:

 > unsigned long foo_0xfffffffbfffff7ff(void) { return 
0xfffffffbfffff7ffUL; }



Note the use of 33 bits above.  That's meant to capture cases like this:


 > unsigned long foo_0xfffdffff7ffff7ff(void) { return 
0xfffdffff7ffff7ffUL; }



We can use lui+addi+bclri+bclri to synthesize that in 4 instructions
instead of 5.




I'm including a handful of cases covering the two basic ideas above that
were found by the testing code.

And, no, we're not done yet.  I see at least one more notable idiom
missing before exploring zbkb's potential to improve things.

Tested in my tester and waiting on Rivos CI system before moving forward.
gcc/

	* config/riscv/predicates.md (arith_operand_or_mode_mask): Renamed to..
	(arith_or_mode_mask_or_zbs_operand): New predicate.
	* config/riscv/riscv.md (and<mode>3): Update predicate for operand 2.
	* config/riscv/riscv.cc (riscv_build_integer_1): Use bclri to clear
	bits, particularly bits 31..63 when profitable to do so.

gcc/testsuite/

	* gcc.target/riscv/synthesis-6.c: New test.
diff mbox series

Patch

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index 8948fbfc363..c1c693c7617 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -27,12 +27,6 @@  (define_predicate "arith_operand"
   (ior (match_operand 0 "const_arith_operand")
        (match_operand 0 "register_operand")))
 
-(define_predicate "arith_operand_or_mode_mask"
-  (ior (match_operand 0 "arith_operand")
-       (and (match_code "const_int")
-            (match_test "UINTVAL (op) == GET_MODE_MASK (HImode)
-			 || UINTVAL (op) == GET_MODE_MASK (SImode)"))))
-
 (define_predicate "lui_operand"
   (and (match_code "const_int")
        (match_test "LUI_OPERAND (INTVAL (op))")))
@@ -398,6 +392,14 @@  (define_predicate "not_single_bit_mask_operand"
   (and (match_code "const_int")
        (match_test "SINGLE_BIT_MASK_OPERAND (~UINTVAL (op))")))
 
+(define_predicate "arith_or_mode_mask_or_zbs_operand"
+  (ior (match_operand 0 "arith_operand")
+       (and (match_test "TARGET_ZBS")
+	    (match_operand 0 "not_single_bit_mask_operand"))
+       (and (match_code "const_int")
+            (match_test "UINTVAL (op) == GET_MODE_MASK (HImode)
+			 || UINTVAL (op) == GET_MODE_MASK (SImode)"))))
+
 (define_predicate "const_si_mask_operand"
   (and (match_code "const_int")
        (match_test "(INTVAL (op) & (GET_MODE_BITSIZE (SImode) - 1))
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 85df5b7ab49..3b32b515fac 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -893,6 +893,40 @@  riscv_build_integer_1 (struct riscv_integer_op codes[RISCV_MAX_INTEGER_OPS],
 	  codes[1].use_uw = false;
 	  cost = 2;
 	}
+
+      /* If LUI/ADDI are going to set bits 32..63 and we need a small
+	 number of them cleared, we might be able to use bclri profitably. 
+
+	 Note we may allow clearing of bit 31 using bclri.  There's a class
+	 of constants with that bit clear where this helps.  */
+      else if (TARGET_64BIT
+	       && TARGET_ZBS
+	       && (32 - popcount_hwi (value & HOST_WIDE_INT_C (0xffffffff80000000))) + 1 < cost)
+	{
+	  /* Turn on all those upper bits and synthesize the result.  */
+	  HOST_WIDE_INT nval = value | HOST_WIDE_INT_C (0xffffffff80000000);
+	  alt_cost = riscv_build_integer_1 (alt_codes, nval, mode);
+
+	  /* Now iterate over the bits we want to clear until the cost is
+	     too high or we're done.  */
+	  nval = value ^ HOST_WIDE_INT_C (-1);
+	  nval &= HOST_WIDE_INT_C (~0x7fffffff);
+	  while (nval && alt_cost < cost)
+	    {
+	      HOST_WIDE_INT bit = ctz_hwi (nval);
+	      alt_codes[alt_cost].code = AND;
+	      alt_codes[alt_cost].value = ~(1UL << bit);
+	      alt_codes[alt_cost].use_uw = false;
+	      alt_cost++;
+	      nval &= ~(1UL << bit);
+	    }
+
+	  if (nval == 0 && alt_cost <= cost)
+	    {
+	      memcpy (codes, alt_codes, sizeof (alt_codes));
+	      cost = alt_cost;
+	    }
+	}
     }
 
   if (cost > 2 && TARGET_64BIT && TARGET_ZBA)
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 78c16adee98..1bef1d67efa 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -1648,7 +1648,7 @@  (define_insn "smax<mode>3"
 (define_expand "and<mode>3"
   [(set (match_operand:X                0 "register_operand")
         (and:X (match_operand:X 1 "register_operand")
-                       (match_operand:X 2 "arith_operand_or_mode_mask")))]
+                       (match_operand:X 2 "arith_or_mode_mask_or_zbs_operand")))]
   ""
 {
   /* If the second operand is a mode mask, emit an extension
diff --git a/gcc/testsuite/gcc.target/riscv/synthesis-6.c b/gcc/testsuite/gcc.target/riscv/synthesis-6.c
new file mode 100644
index 00000000000..65cf748f4b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/synthesis-6.c
@@ -0,0 +1,95 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target rv64 } */
+/* We aggressively skip as we really just need to test the basic synthesis
+   which shouldn't vary based on the optimization level.  -O1 seems to work
+   and eliminates the usual sources of extraneous dead code that would throw
+   off the counts.  */
+/* { dg-skip-if "" { *-*-* } { "-O0" "-Og" "-O2" "-O3" "-Os" "-Oz" "-flto" } } */
+/* { dg-options "-march=rv64gc_zba_zbb_zbs" } */
+
+/* Rather than test for a specific synthesis of all these constants or
+   having thousands of tests each testing one variant, we just test the
+   total number of instructions. 
+
+   This isn't expected to change much and any change is worthy of a look.  */
+/* { dg-final { scan-assembler-times "\\t(add|addi|bseti|bclri|li|ret|sh1add|sh2add|sh3add|slli)" 156 } } */
+
+
+unsigned long foo_0xfffffffbfffff7ff(void) { return 0xfffffffbfffff7ffUL; }
+
+unsigned long foo_0xfffffff7fffff7ff(void) { return 0xfffffff7fffff7ffUL; }
+
+unsigned long foo_0xffffffeffffff7ff(void) { return 0xffffffeffffff7ffUL; }
+
+unsigned long foo_0xffffffdffffff7ff(void) { return 0xffffffdffffff7ffUL; }
+
+unsigned long foo_0xffffffbffffff7ff(void) { return 0xffffffbffffff7ffUL; }
+
+unsigned long foo_0xffffff7ffffff7ff(void) { return 0xffffff7ffffff7ffUL; }
+
+unsigned long foo_0xfffffefffffff7ff(void) { return 0xfffffefffffff7ffUL; }
+
+unsigned long foo_0xfffffdfffffff7ff(void) { return 0xfffffdfffffff7ffUL; }
+
+unsigned long foo_0xfffffbfffffff7ff(void) { return 0xfffffbfffffff7ffUL; }
+
+unsigned long foo_0xfffff7fffffff7ff(void) { return 0xfffff7fffffff7ffUL; }
+
+unsigned long foo_0xffffeffffffff7ff(void) { return 0xffffeffffffff7ffUL; }
+
+unsigned long foo_0xffffdffffffff7ff(void) { return 0xffffdffffffff7ffUL; }
+
+unsigned long foo_0xffffbffffffff7ff(void) { return 0xffffbffffffff7ffUL; }
+
+unsigned long foo_0xffff7ffffffff7ff(void) { return 0xffff7ffffffff7ffUL; }
+
+unsigned long foo_0xfffefffffffff7ff(void) { return 0xfffefffffffff7ffUL; }
+
+unsigned long foo_0xfffdfffffffff7ff(void) { return 0xfffdfffffffff7ffUL; }
+
+unsigned long foo_0xfffbfffffffff7ff(void) { return 0xfffbfffffffff7ffUL; }
+
+unsigned long foo_0xfff7fffffffff7ff(void) { return 0xfff7fffffffff7ffUL; }
+
+unsigned long foo_0xffeffffffffff7ff(void) { return 0xffeffffffffff7ffUL; }
+
+unsigned long foo_0xffdffffffffff7ff(void) { return 0xffdffffffffff7ffUL; }
+
+unsigned long foo_0xffbffffffffff7ff(void) { return 0xffbffffffffff7ffUL; }
+
+unsigned long foo_0xff7ffffffffff7ff(void) { return 0xff7ffffffffff7ffUL; }
+
+unsigned long foo_0xfefffffffffff7ff(void) { return 0xfefffffffffff7ffUL; }
+
+unsigned long foo_0xfdfffffffffff7ff(void) { return 0xfdfffffffffff7ffUL; }
+
+unsigned long foo_0xfbfffffffffff7ff(void) { return 0xfbfffffffffff7ffUL; }
+
+unsigned long foo_0xf7fffffffffff7ff(void) { return 0xf7fffffffffff7ffUL; }
+
+unsigned long foo_0xeffffffffffff7ff(void) { return 0xeffffffffffff7ffUL; }
+
+unsigned long foo_0xdffffffffffff7ff(void) { return 0xdffffffffffff7ffUL; }
+
+unsigned long foo_0xbffffffffffff7ff(void) { return 0xbffffffffffff7ffUL; }
+
+unsigned long foo_0xffffffff7fffd7ff(void) { return 0xffffffff7fffd7ffUL; }
+
+unsigned long foo_0xffffffff7ffdf7ff(void) { return 0xffffffff7ffdf7ffUL; }
+
+unsigned long foo_0xffffffff7fdff7ff(void) { return 0xffffffff7fdff7ffUL; }
+
+unsigned long foo_0xffffffff7dfff7ff(void) { return 0xffffffff7dfff7ffUL; }
+
+unsigned long foo_0xffffffff5ffff7ff(void) { return 0xffffffff5ffff7ffUL; }
+
+unsigned long foo_0xfffff7ff7ffff7ff(void) { return 0xfffff7ff7ffff7ffUL; }
+
+unsigned long foo_0xffffdfff7ffff7ff(void) { return 0xffffdfff7ffff7ffUL; }
+
+unsigned long foo_0xffff7fff7ffff7ff(void) { return 0xffff7fff7ffff7ffUL; }
+
+unsigned long foo_0xfffdffff7ffff7ff(void) { return 0xfffdffff7ffff7ffUL; }
+
+
+