diff mbox series

[17/17,APX,NDD] Support TImode shift for NDD

Message ID 20231205022948.504790-18-hongyu.wang@intel.com
State New
Headers show
Series Support Intel APX NDD | expand

Commit Message

Hongyu Wang Dec. 5, 2023, 2:29 a.m. UTC
TImode shifts are split by splitter functions that assume operands[0]
and operands[1] are the same.  For the NDD alternative that assumption
may not hold, so add split functions that emit the NDD form instructions
and omit the handling of the !64bit target split.

Although the NDD form allows a memory source, the post-reload splitter has
no extra register available for the NDD form shifts, especially shld/shrd.
So only accept the register alternative for the shift source under NDD.

gcc/ChangeLog:

	* config/i386/i386-expand.cc (ix86_split_ashl_ndd): New
	function to split NDD form lshift.
	(ix86_split_rshift_ndd): Likewise for l/ashiftrt.
	* config/i386/i386-protos.h (ix86_split_ashl_ndd): New
	prototype.
	(ix86_split_rshift_ndd): Likewise.
	* config/i386/i386.md (ashl<mode>3_doubleword): Add NDD
	alternative, call ndd split function when operands[0]
	not equal to operands[1].
	(define_split for doubleword lshift): Likewise.
	(define_peephole for doubleword lshift): Likewise.
	(<insn><mode>3_doubleword): Likewise for l/ashiftrt.
	(define_split for doubleword l/ashiftrt): Likewise.
	(define_peephole for doubleword l/ashiftrt): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/apx-ndd-ti-shift.c: New test.
---
 gcc/config/i386/i386-expand.cc                | 136 ++++++++++++++++++
 gcc/config/i386/i386-protos.h                 |   2 +
 gcc/config/i386/i386.md                       |  56 ++++++--
 .../gcc.target/i386/apx-ndd-ti-shift.c        |  91 ++++++++++++
 4 files changed, 273 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d4bbd33ce07..a53d69d5400 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -6678,6 +6678,142 @@  ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
     }
 }
 
+/* Split TImode ashl OPERANDS[0] = OPERANDS[1] << OPERANDS[2] for NDD.  */
+void
+ix86_split_ashl_ndd (rtx *operands, rtx scratch)
+{
+  gcc_assert (TARGET_APX_NDD);
+  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
+
+  rtx low[2], high[2];
+  int count;
+
+  split_double_mode (TImode, operands, 2, low, high);
+  if (CONST_INT_P (operands[2]))
+    {
+      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
+
+      if (count >= half_width)  /* Only low[1] is used.  */
+	{
+	  count = count - half_width;
+	  if (count == 0)
+	    {
+	      if (!rtx_equal_p (high[0], low[1]))
+		emit_move_insn (high[0], low[1]);
+	    }
+	  else if (count == 1)  /* low[1] + low[1] == low[1] << 1.  */
+	    emit_insn (gen_adddi3 (high[0], low[1], low[1]));
+	  else
+	    emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));
+
+	  ix86_expand_clear (low[0]);
+	}
+      else if (count == 1)  /* Double both halves, linked by FLAGS_REG.  */
+	{
+	  rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
+	  rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
+	  emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
+					     low[1], low[1]));
+	  emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
+				     x3, x4));
+	}
+      else
+	{
+	  emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
+					  GEN_INT (count)));
+	  emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
+	}
+    }
+  else  /* Shift count is not a compile-time constant.  */
+    {
+      emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
+				      operands[2]));
+      emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
+      if (TARGET_CMOVE && scratch)  /* SCRATCH may be NULL_RTX.  */
+	{
+	  ix86_expand_clear (scratch);
+	  emit_insn (gen_x86_shift_adj_1
+		     (DImode, high[0], low[0], operands[2], scratch));
+	}
+      else
+	emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
+    }
+}
+
+/* Split TImode right shift for NDD; CODE == ASHIFTRT selects arithmetic.  */
+void
+ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
+{
+  gcc_assert (TARGET_APX_NDD);
+  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
+  bool ashr_p = code == ASHIFTRT;
+  rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
+					 : gen_lshrdi3;
+
+  rtx low[2], high[2];
+  int count;
+
+  split_double_mode (TImode, operands, 2, low, high);
+  if (CONST_INT_P (operands[2]))
+    {
+      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
+
+      if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
+	{
+	  emit_insn (gen_shr (high[0], high[1],
+			      GEN_INT (half_width - 1)));
+	  emit_move_insn (low[0], high[0]);  /* Both halves end up equal.  */
+	}
+      else if (count >= half_width)  /* Low result comes from high[1].  */
+	{
+	  if (ashr_p)
+	    emit_insn (gen_shr (high[0], high[1],
+				GEN_INT (half_width - 1)));
+	  else
+	    ix86_expand_clear (high[0]);
+
+	  if (count > half_width)
+	    emit_insn (gen_shr (low[0], high[1],
+				GEN_INT (count - half_width)));
+	  else
+	    emit_move_insn (low[0], high[1]);
+	}
+      else  /* count < half_width.  */
+	{
+	  emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
+					  GEN_INT (count)));
+	  emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
+	}
+    }
+  else  /* Shift count is not a compile-time constant.  */
+    {
+      emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
+				      operands[2]));
+      emit_insn (gen_shr (high[0], high[1], operands[2]));
+
+      if (TARGET_CMOVE && scratch)  /* SCRATCH may be NULL_RTX.  */
+	{
+	  if (ashr_p)  /* scratch = high[0] >> (half_width - 1).  */
+	    {
+	      emit_move_insn (scratch, high[0]);
+	      emit_insn (gen_shr (scratch, scratch,
+				  GEN_INT (half_width - 1)));
+	    }
+	  else
+	    ix86_expand_clear (scratch);
+
+	  emit_insn (gen_x86_shift_adj_1
+		     (DImode, low[0], high[0], operands[2], scratch));
+	}
+      else if (ashr_p)
+	emit_insn (gen_x86_shift_adj_3
+		   (DImode, low[0], high[0], operands[2]));
+      else
+	emit_insn (gen_x86_shift_adj_2
+		   (DImode, low[0], high[0], operands[2]));
+    }
+}
+
 /* Expand move of V1TI mode register X to a new TI mode register.  */
 static rtx
 ix86_expand_v1ti_to_ti (rtx x)
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index fa952409729..56349064a6c 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -174,8 +174,10 @@  extern void x86_initialize_trampoline (rtx, rtx, rtx);
 extern rtx ix86_zero_extend_to_Pmode (rtx);
 extern void ix86_split_long_move (rtx[]);
 extern void ix86_split_ashl (rtx *, rtx, machine_mode);
+extern void ix86_split_ashl_ndd (rtx *, rtx);
 extern void ix86_split_ashr (rtx *, rtx, machine_mode);
 extern void ix86_split_lshr (rtx *, rtx, machine_mode);
+extern void ix86_split_rshift_ndd (enum rtx_code, rtx *, rtx);
 extern void ix86_expand_v1ti_shift (enum rtx_code, rtx[]);
 extern void ix86_expand_v1ti_rotate (enum rtx_code, rtx[]);
 extern void ix86_expand_v1ti_ashiftrt (rtx[]);
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 853f53c2bb9..331dda89b29 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14420,13 +14420,14 @@  (define_insn_and_split "*ashl<dwi>3_doubleword_mask_1"
 })
 
 (define_insn "ashl<mode>3_doubleword"
-  [(set (match_operand:DWI 0 "register_operand" "=&r")
-	(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n")
-		    (match_operand:QI 2 "nonmemory_operand" "<S>c")))
+  [(set (match_operand:DWI 0 "register_operand" "=&r,&r") ; '&': split clobbers op0 early
+	(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
+		    (match_operand:QI 2 "nonmemory_operand" "<S>c,<S>c")))
    (clobber (reg:CC FLAGS_REG))]
   ""
   "#"
-  [(set_attr "type" "multi")])
+  [(set_attr "type" "multi")
+   (set_attr "isa" "*,apx_ndd")])
 
 (define_split
   [(set (match_operand:DWI 0 "register_operand")
@@ -14435,7 +14436,15 @@  (define_split
    (clobber (reg:CC FLAGS_REG))]
   "epilogue_completed"
   [(const_int 0)]
-  "ix86_split_ashl (operands, NULL_RTX, <MODE>mode); DONE;")
+{
+  if (TARGET_APX_NDD
+      && !rtx_equal_p (operands[0], operands[1])
+      && REG_P (operands[1]))
+    ix86_split_ashl_ndd (operands, NULL_RTX);
+  else
+    ix86_split_ashl (operands, NULL_RTX, <MODE>mode);
+  DONE;
+})
 
 ;; By default we don't ask for a scratch register, because when DWImode
 ;; values are manipulated, registers are already at a premium.  But if
@@ -14451,7 +14460,15 @@  (define_peephole2
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_ashl (operands, operands[3], <DWI>mode); DONE;")
+{
+  if (TARGET_APX_NDD
+      && !rtx_equal_p (operands[0], operands[1])
+      && (REG_P (operands[1])))
+    ix86_split_ashl_ndd (operands, operands[3]);
+  else
+    ix86_split_ashl (operands, operands[3], <DWI>mode);
+  DONE;
+})
 
 (define_insn_and_split "*ashl<dwi>3_doubleword_highpart"
   [(set (match_operand:<DWI> 0 "register_operand" "=r")
@@ -15708,16 +15725,24 @@  (define_insn_and_split "*<insn><dwi>3_doubleword_mask_1"
 })
 
 (define_insn_and_split "<insn><mode>3_doubleword"
-  [(set (match_operand:DWI 0 "register_operand" "=&r")
-	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
-			 (match_operand:QI 2 "nonmemory_operand" "<S>c")))
+  [(set (match_operand:DWI 0 "register_operand" "=&r,&r") ; '&': split clobbers op0 early
+	(any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0,r")
+			 (match_operand:QI 2 "nonmemory_operand" "<S>c,<S>c")))
    (clobber (reg:CC FLAGS_REG))]
   ""
   "#"
   "epilogue_completed"
   [(const_int 0)]
-  "ix86_split_<insn> (operands, NULL_RTX, <MODE>mode); DONE;"
-  [(set_attr "type" "multi")])
+{
+  if (TARGET_APX_NDD
+      && !rtx_equal_p (operands[0], operands[1]))
+    ix86_split_rshift_ndd (<CODE>, operands, NULL_RTX);
+  else
+    ix86_split_<insn> (operands, NULL_RTX, <MODE>mode);
+  DONE;
+}
+  [(set_attr "type" "multi")
+   (set_attr "isa" "*,apx_ndd")])
 
 ;; By default we don't ask for a scratch register, because when DWImode
 ;; values are manipulated, registers are already at a premium.  But if
@@ -15733,7 +15758,14 @@  (define_peephole2
    (match_dup 3)]
   "TARGET_CMOVE"
   [(const_int 0)]
-  "ix86_split_<insn> (operands, operands[3], <DWI>mode); DONE;")
+{
+  if (TARGET_APX_NDD
+      && !rtx_equal_p (operands[0], operands[1]))
+    ix86_split_rshift_ndd (<CODE>, operands, operands[3]);
+  else
+    ix86_split_<insn> (operands, operands[3], <DWI>mode);
+  DONE;
+})
 
 ;; Split truncations of double word right shifts into x86_shrd_1.
 (define_insn_and_split "<insn><dwi>3_doubleword_lowpart"
diff --git a/gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c b/gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
new file mode 100644
index 00000000000..0489712b7f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/apx-ndd-ti-shift.c
@@ -0,0 +1,91 @@ 
+/* { dg-do run { target { int128 && { ! ia32 } } } } */
+/* { dg-require-effective-target apxf } */
+/* { dg-options "-O2" } */
+
+#include <stdlib.h>
+
+#define APX_TARGET __attribute__((noinline, target("apxf")))
+#define NO_APX __attribute__((noinline, target("no-apxf")))
+typedef __uint128_t u128;
+typedef __int128 i128;
+
+#define TI_SHIFT_FUNC(TYPE, op, name) /* variable-count APX/non-APX pair */ \
+APX_TARGET \
+TYPE apx_##name##TYPE (TYPE a, char b) \
+{ \
+  return a op b; \
+} \
+NO_APX TYPE noapx_##name##TYPE (TYPE a, char b) \
+{ \
+  return a op b; \
+}
+
+#define TI_SHIFT_FUNC_CONST(TYPE, i, op, name) /* constant count I */ \
+APX_TARGET \
+TYPE apx_##name##TYPE##_const (TYPE a) \
+{ \
+  return a op i; \
+} \
+NO_APX \
+TYPE noapx_##name##TYPE##_const (TYPE a) \
+{ \
+  return a op i; \
+}
+
+#define TI_SHIFT_TEST(TYPE, name, val) /* reads `b' from the caller */ \
+{\
+  if (apx_##name##TYPE (val, b) != noapx_##name##TYPE (val, b)) \
+    abort (); \
+}
+
+#define TI_SHIFT_CONST_TEST(TYPE, name, val) /* checks variants 1 to 4 */ \
+{\
+  if (apx_##name##1##TYPE##_const (val) \
+      != noapx_##name##1##TYPE##_const (val)) \
+    abort (); \
+  if (apx_##name##2##TYPE##_const (val) \
+      != noapx_##name##2##TYPE##_const (val)) \
+    abort (); \
+  if (apx_##name##3##TYPE##_const (val) \
+      != noapx_##name##3##TYPE##_const (val)) \
+    abort (); \
+  if (apx_##name##4##TYPE##_const (val) \
+      != noapx_##name##4##TYPE##_const (val)) \
+    abort (); \
+}
+
+TI_SHIFT_FUNC(i128, <<, ashl)
+TI_SHIFT_FUNC(i128, >>, ashr)
+TI_SHIFT_FUNC(u128, >>, lshr)
+
+TI_SHIFT_FUNC_CONST(i128, 1, <<, ashl1)  /* count == 1 */
+TI_SHIFT_FUNC_CONST(i128, 65, <<, ashl2)  /* count - 64 == 1 */
+TI_SHIFT_FUNC_CONST(i128, 64, <<, ashl3)  /* count == 64 */
+TI_SHIFT_FUNC_CONST(i128, 87, <<, ashl4)  /* count > 65 */
+TI_SHIFT_FUNC_CONST(i128, 127, >>, ashr1)  /* count == 127 */
+TI_SHIFT_FUNC_CONST(i128, 87, >>, ashr2)  /* count > 64 */
+TI_SHIFT_FUNC_CONST(i128, 27, >>, ashr3)  /* count < 64 */
+TI_SHIFT_FUNC_CONST(i128, 64, >>, ashr4)  /* count == 64 */
+TI_SHIFT_FUNC_CONST(u128, 127, >>, lshr1)  /* count > 64 */
+TI_SHIFT_FUNC_CONST(u128, 87, >>, lshr2)  /* count > 64 */
+TI_SHIFT_FUNC_CONST(u128, 27, >>, lshr3)  /* count < 64 */
+TI_SHIFT_FUNC_CONST(u128, 64, >>, lshr4)  /* count == 64 */
+
+int main (void)
+{
+  if (!__builtin_cpu_supports ("apxf"))
+    return 0;
+  /* Populate both halves; ival is negative so ashr must sign-fill.  */
+  i128 ival = (i128) (((u128) 0x923456788765432FULL << 64) | 0x876543210FULL);
+  u128 uval = ((u128) 0xF234567887654321ULL << 64) | 0x123456788765432FULL;
+  char b = 28;
+
+  TI_SHIFT_TEST(i128, ashl, ival)
+  TI_SHIFT_TEST(i128, ashr, ival)
+  TI_SHIFT_TEST(u128, lshr, uval)
+  TI_SHIFT_CONST_TEST(i128, ashl, ival)
+  TI_SHIFT_CONST_TEST(i128, ashr, ival)
+  TI_SHIFT_CONST_TEST(u128, lshr, uval)
+
+  return 0;
+}