diff mbox series

[nvptx] Implement rtx_costs target hook for nvptx backend.

Message ID 001101dad3b2$ef215730$cd640590$@nextmovesoftware.com
State New
Headers show
Series [nvptx] Implement rtx_costs target hook for nvptx backend. | expand

Commit Message

Roger Sayle July 11, 2024, 4:54 p.m. UTC
This patch adds support for TARGET_RTX_COSTS to the nvptx backend.
Currently, nvptx uses GCC's default instruction timing estimates,
but this patch provides (slightly) more accurate timings.  The
most significant difference is that integer division is much slower
(relatively) than other instructions, so the compiler should make
more use of the middle-end's expand_divmod, which can replace a
division by a constant with a high-part multiply and a shift.

As an example of the benefit, consider:

int foo(unsigned int x)
{
  return x/10;
}

currently with -O2 we generate:

.visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0)
{
        .reg.u32 %value;
        .reg.u32 %ar0;
        ld.param.u32 %ar0, [%in_ar0];
        .reg.u32 %r24;
                mov.u32 %r24, %ar0;
                div.u32 %value, %r24, 10;
        st.param.u32    [%value_out], %value;
        ret;
}

but with this patch, we now generate:

.visible .func (.param.u32 %value_out) foo (.param.u32 %in_ar0)
{
        .reg.u32 %value;
        .reg.u32 %ar0;
        ld.param.u32 %ar0, [%in_ar0];
        .reg.u32 %r24;
        .reg.u32 %r26;
                mov.u32 %r24, %ar0;
                mul.hi.u32      %r26, %r24, -858993459;
                shr.u32 %value, %r26, 3;
        st.param.u32    [%value_out], %value;
        ret;
}

The performance benefits can be seen/measured by the attached
microbenchmark, bench.c, when run with nvptx-none-run-single.

Before:
result = 266546680000
19004366269 ticks
15.203493 seconds

After:
result = 266546680000
5153988012 ticks
4.123190 seconds

So about a 3.7x performance improvement.


This patch has been tested with make and make -k check for nvptx-none
hosted on x86_64-pc-linux-gnu with no new failures.  Ok for mainline?


2024-07-11  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/nvptx/nvptx.cc (nvptx_rtx_size_costs): New function to
        estimate the size of an RTX expression (in ptxas instructions).
        (nvptx_rtx_costs): Implementation of rtx_costs target hook.
        (TARGET_RTX_COSTS): Define to nvptx_rtx_costs.

gcc/testsuite/ChangeLog
        * gcc.target/nvptx/div10.c: New test case.


Thanks in advance,
Roger
--

#include <stdio.h>

/* Benchmark kernel: accumulate inner / 5 for every pair
   0 <= inner < outer < 20000 and return the sum.  The work is dominated
   by unsigned division by a constant.  */
unsigned long bench()
{
  unsigned long sum = 0;
  unsigned int outer = 0;

  while (outer < 20000)
    {
      unsigned int inner = 0;

      while (inner < outer)
        {
          sum += inner / 5;
          inner++;
        }
      outer++;
    }
  return sum;
}

// Nvidia Quadro P400
// #define NVPTX_HZ 1170e6
#define NVPTX_HZ 1250e6

/* Return the current value of the PTX %clock64 special register.

   Declared `static inline' rather than plain `inline': in C99 and later a
   plain `inline' definition does not provide an external definition, so a
   build in which the call is not inlined (e.g. -O0) would fail to link
   with an undefined reference to `ticks'.  */
static inline unsigned long ticks(void)
{
    unsigned long now;
    /* `volatile' keeps the read from being removed or merged with another
       read, so each call samples the counter afresh.  */
    asm volatile("mov.u64 %0, %%clock64;" : "=r"(now));
    return now;
}

/* Run bench() once, bracketed by two ticks() samples, then print the
   benchmark result, the elapsed tick count, and that count divided by
   NVPTX_HZ as seconds.  */
int main()
{
  unsigned long start = ticks();
  unsigned long answer = bench();
  /* Sample the counter again only after bench() has returned.  */
  unsigned long elapsed = ticks() - start;
  /* NVPTX_HZ is a floating-point constant, so this division is done in
     double.  */
  double seconds = elapsed / NVPTX_HZ;

  printf("result = %lu\n", answer);
  printf("%lu ticks\n", elapsed);
  printf("%f seconds\n", seconds);
  return 0;
}
diff mbox series

Patch

diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index 2a8f713..5ae2a76 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -7511,6 +7511,132 @@  nvptx_goacc_expand_var_decl (tree var)
   return NULL_RTX;
 }
 
+/* Helper function of nvptx_rtx_costs.
+   Assume each ptxas instruction has the same size.  */
+
+static bool
+nvptx_rtx_size_costs (rtx x, machine_mode mode, int outer_code, int *total)
+{
+  int code = GET_CODE (x);
+
+  switch (code)
+    {
+    case PLUS:
+    case MINUS:
+    case MULT:
+    case DIV:
+    case MOD:
+    case FMA:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode
+	  || mode == SFmode || mode == DFmode)
+	*total = COSTS_N_INSNS (1);
+      break;
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+    case ZERO_EXTEND:
+    case UDIV:
+    case UMOD:
+    case ABS:
+    case POPCOUNT:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode)
+	*total = COSTS_N_INSNS (1);
+      break;
+    case SUBREG:
+    case TRUNCATE:
+      *total = COSTS_N_INSNS (1);
+      break;
+    case REG:
+    case CONST_INT:
+    case CONST_DOUBLE:
+      if (outer_code == SET)
+	*total = COSTS_N_INSNS (1);
+      break;
+    }
+  return false;
+}
+
+/* Implement TARGET_RTX_COSTS.  */
+
+static bool
+nvptx_rtx_costs (rtx x, machine_mode mode, int outer_code,
+		 int opno ATTRIBUTE_UNUSED, int *total, bool speed_p)
+{
+  if (! speed_p)
+    return nvptx_rtx_size_costs (x, mode, outer_code, total);
+
+  int code = GET_CODE (x);
+
+  switch (code)
+    {
+    case PLUS:
+    case MINUS:
+    case NEG:
+    case ASHIFT:
+    case ASHIFTRT:
+    case LSHIFTRT:
+    case ZERO_EXTEND:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode
+	  || mode == SFmode || mode == DFmode)
+	*total = COSTS_N_INSNS (1);
+      break;
+    case MULT:
+    case FMA:
+    case UMUL_HIGHPART:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode
+	  || mode == SFmode || mode == DFmode)
+	*total = COSTS_N_INSNS (2);
+      break;
+    case DIV:
+    case MOD:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode)
+	*total = COSTS_N_INSNS (25);
+      else if (mode == SFmode)
+	*total = COSTS_N_INSNS (64);
+      else if (mode == DFmode)
+	*total = COSTS_N_INSNS (90);
+      break;
+    case UDIV:
+    case UMOD:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode)
+	*total = COSTS_N_INSNS (24);
+      break;
+    case ABS:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode)
+	*total = COSTS_N_INSNS (2);
+      break;
+    case POPCOUNT:
+      if (mode == SImode || mode == DImode
+	  || mode == HImode || mode == QImode)
+	*total = COSTS_N_INSNS (4);
+      break;
+    case SUBREG:
+    case TRUNCATE:
+      *total = COSTS_N_INSNS (1);
+      break;
+    case REG:
+    case CONST_INT:
+    case CONST_DOUBLE:
+      *total = (outer_code == SET) ? COSTS_N_INSNS (1) : 0;
+      return true;
+    case UNSPEC:
+      if (XINT (x, 1) == UNSPEC_ARG_REG)
+	{
+	  *total = (outer_code == SET) ? COSTS_N_INSNS (1) : 0;
+	  return true;
+	}
+      break;
+    }
+  return false;
+}
+
 static GTY(()) tree nvptx_previous_fndecl;
 
 static void
@@ -7786,6 +7912,9 @@  nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
 #undef TARGET_GOACC_EXPAND_VAR_DECL
 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
 
+#undef TARGET_RTX_COSTS
+#define TARGET_RTX_COSTS nvptx_rtx_costs
+
 #undef TARGET_SET_CURRENT_FUNCTION
 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
 
diff --git a/gcc/testsuite/gcc.target/nvptx/div10.c b/gcc/testsuite/gcc.target/nvptx/div10.c
new file mode 100644
index 0000000..fce61b4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/nvptx/div10.c
@@ -0,0 +1,10 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+int foo(unsigned int x)
+{
+  return x/10;
+}
+
+/* { dg-final { scan-assembler "mul.hi.u32" } } */
+/* { dg-final { scan-assembler "shr.u32" } } */
+/* { dg-final { scan-assembler-not "div.u32" } } */