diff mbox series

[to-be-committed,V3,RISC-V] cmpmem for RISCV with V extension

Message ID ac9eecb4-65b6-4764-bc33-09bdb6bfaf36@gmail.com
State New
Headers show
Series [to-be-committed,V3,RISC-V] cmpmem for RISCV with V extension | expand

Commit Message

Jeff Law June 24, 2024, 11:44 p.m. UTC
So this is the cmpmem patch from Sergei, updated for the trunk.

Updates included adjusting the existing cmpmemsi expander to 
conditionally try expansion via vector.  And a minor testsuite 
adjustment to turn off vector expansion in one test that is primarily 
focused on vset optimization and ensuring we don't have extras.

I've spun this in my tester successfully and just want to see a clean 
run through precommit CI before moving forward.

Jeff
gcc/ChangeLog:

	* config/riscv/riscv-protos.h (riscv_vector::expand_vec_cmpmem): New
	function declaration.
	* config/riscv/riscv-string.cc (riscv_vector::expand_vec_cmpmem): New
	function.
	* config/riscv/riscv.md (cmpmemsi): Try riscv_vector::expand_vec_cmpmem
	for constant lengths.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/base/cmpmem-1.c: New codegen tests
	* gcc.target/riscv/rvv/base/cmpmem-2.c: New execution tests
	* gcc.target/riscv/rvv/base/cmpmem-3.c: New codegen tests
	* gcc.target/riscv/rvv/base/cmpmem-4.c: New codegen tests
	* gcc.target/riscv/rvv/autovec/vls/misalign-1.c: Turn off vector mem* and
	str* handling.
diff mbox series

Patch

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index a3380d4250d..a8b76173fa0 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -679,6 +679,7 @@  void expand_rawmemchr (machine_mode, rtx, rtx, rtx, bool = false);
 bool expand_strcmp (rtx, rtx, rtx, rtx, unsigned HOST_WIDE_INT, bool);
 void emit_vec_extract (rtx, rtx, rtx);
 bool expand_vec_setmem (rtx, rtx, rtx);
+bool expand_vec_cmpmem (rtx, rtx, rtx, rtx);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-string.cc b/gcc/config/riscv/riscv-string.cc
index 1ddebdcee3f..257a514d290 100644
--- a/gcc/config/riscv/riscv-string.cc
+++ b/gcc/config/riscv/riscv-string.cc
@@ -1605,4 +1605,104 @@  expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
   return true;
 }
 
+/* Used by cmpmemsi in riscv.md.  */
+
+bool
+expand_vec_cmpmem (rtx result_out, rtx blk_a_in, rtx blk_b_in, rtx length_in)
+{
+  HOST_WIDE_INT lmul;
+  /* Check we are able and allowed to vectorise this operation;
+     bail if not.  */
+  if (!check_vectorise_memory_operation (length_in, lmul))
+    return false;
+
+  /* Strategy:
+     load entire blocks at a and b into vector regs
+     generate mask of bytes that differ
+     find first set bit in mask
+     find offset of first set bit in mask, use 0 if none set
+     result is ((char*)a[offset] - (char*)b[offset])
+   */
+
+  machine_mode vmode
+      = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
+	      .require ();
+  rtx blk_a_addr = copy_addr_to_reg (XEXP (blk_a_in, 0));
+  rtx blk_a = change_address (blk_a_in, vmode, blk_a_addr);
+  rtx blk_b_addr = copy_addr_to_reg (XEXP (blk_b_in, 0));
+  rtx blk_b = change_address (blk_b_in, vmode, blk_b_addr);
+
+  rtx vec_a = gen_reg_rtx (vmode);
+  rtx vec_b = gen_reg_rtx (vmode);
+
+  machine_mode mask_mode = get_mask_mode (vmode);
+  rtx mask = gen_reg_rtx (mask_mode);
+  rtx mismatch_ofs = gen_reg_rtx (Pmode);
+
+  rtx ne = gen_rtx_NE (mask_mode, vec_a, vec_b);
+  rtx vmsops[] = { mask, ne, vec_a, vec_b };
+  rtx vfops[] = { mismatch_ofs, mask };
+
+  /* If the length is exactly vlmax for the selected mode, do that.
+     Otherwise, use a predicated store.  */
+
+  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
+    {
+      emit_move_insn (vec_a, blk_a);
+      emit_move_insn (vec_b, blk_b);
+      emit_vlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
+		       vmsops);
+
+      emit_vlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+		       riscv_vector::CPOP_OP, vfops);
+    }
+  else
+    {
+      if (!satisfies_constraint_K (length_in))
+	      length_in = force_reg (Pmode, length_in);
+
+      rtx memmask = CONSTM1_RTX (mask_mode);
+
+      rtx m_ops_a[] = { vec_a, memmask, blk_a };
+      rtx m_ops_b[] = { vec_b, memmask, blk_b };
+
+      emit_nonvlmax_insn (code_for_pred_mov (vmode),
+			  riscv_vector::UNARY_OP_TAMA, m_ops_a, length_in);
+      emit_nonvlmax_insn (code_for_pred_mov (vmode),
+			  riscv_vector::UNARY_OP_TAMA, m_ops_b, length_in);
+
+      emit_nonvlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
+			  vmsops, length_in);
+
+      emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
+			  riscv_vector::CPOP_OP, vfops, length_in);
+    }
+
+  /* Mismatch_ofs is -1 if blocks match, or the offset of
+     the first mismatch otherwise.  */
+  rtx ltz = gen_reg_rtx (Xmode);
+  emit_insn (gen_slt_3 (LT, Xmode, Xmode, ltz, mismatch_ofs, const0_rtx));
+  /* mismatch_ofs += (mismatch_ofs < 0) ? 1 : 0.  */
+  emit_insn (
+      gen_rtx_SET (mismatch_ofs, gen_rtx_PLUS (Pmode, mismatch_ofs, ltz)));
+
+  /* Unconditionally load the bytes at mismatch_ofs and subtract them
+     to get our result.  */
+  emit_insn (gen_rtx_SET (blk_a_addr,
+			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_a_addr)));
+  emit_insn (gen_rtx_SET (blk_b_addr,
+			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_b_addr)));
+
+  blk_a = change_address (blk_a, QImode, blk_a_addr);
+  blk_b = change_address (blk_b, QImode, blk_b_addr);
+
+  rtx byte_a = gen_reg_rtx (SImode);
+  rtx byte_b = gen_reg_rtx (SImode);
+  do_zero_extendqi2 (byte_a, blk_a);
+  do_zero_extendqi2 (byte_b, blk_b);
+
+  emit_insn (gen_rtx_SET (result_out, gen_rtx_MINUS (SImode, byte_a, byte_b)));
+
+  return true;
+}
 }
diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 78cf83c9252..ff37125e3f2 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -2669,6 +2669,12 @@  (define_expand "cmpmemsi"
 	      (use (match_operand:SI 4))])]
   "!optimize_size"
 {
+  /* If TARGET_VECTOR is false, this routine will return false and we will
+     try scalar expansion.  */
+  if (riscv_vector::expand_vec_cmpmem (operands[0], operands[1],
+				       operands[2], operands[3]))
+    DONE;
+
   if (riscv_expand_block_compare (operands[0], operands[1], operands[2],
                                   operands[3]))
     DONE;
@@ -2717,7 +2723,6 @@  (define_expand "setmem<mode>"
     FAIL;
 })
 
-
 ;; Expand in-line code to clear the instruction cache between operand[0] and
 ;; operand[1].
 (define_expand "clear_cache"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/misalign-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/misalign-1.c
index 5184a295e16..9d698b421d6 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/misalign-1.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/misalign-1.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m4 -fno-tree-loop-distribute-patterns -mno-vector-strict-align" } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2 -mrvv-max-lmul=m4 -fno-tree-loop-distribute-patterns -mno-vector-strict-align -mstringop-strategy=libcall" } */
 
 #include <stdlib.h>
 
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-1.c
new file mode 100644
index 00000000000..6bc8b07bc2c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-1.c
@@ -0,0 +1,88 @@ 
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=dynamic" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Trivial memcmp should use inline scalar ops.
+** f1:
+**  lbu\s+a\d+,0\(a0\)
+**  lbu\s+a\d+,0\(a1\)
+**  subw?\s+a0,a\d+,a\d+
+**  ret
+*/
+int
+f1 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, 1);
+}
+
+/* Tiny __builtin_memcmp should use libc.
+** f2:
+**  li\s+a\d,\d+
+**  tail\s+memcmp
+*/
+int
+f2 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=1
+** f3:
+**  (
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
+**  )
+**  ...
+**  ret
+*/
+int
+f3 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorised code should use smallest lmul known to fit length
+** f4:
+**  (
+**  vsetivli\s+zero,\d+,e8,m2,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m2,ta,ma
+**  )
+**  ...
+**  ret
+*/
+int
+f4 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES + 1);
+}
+
+/* Vectorise+inline up to LMUL=8
+** f5:
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  ...
+**  ret
+*/
+int
+f5 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f6:
+**  li\s+a2,\d+
+**  tail\s+memcmp
+*/
+int
+f6 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-2.c
new file mode 100644
index 00000000000..c782cc6c6e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-2.c
@@ -0,0 +1,74 @@ 
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-add-options riscv_v } */
+/* { dg-options "-O2 -mrvv-max-lmul=dynamic" } */
+
+#include <stdlib.h>
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+static inline __attribute__ ((always_inline)) void
+do_one_test (int const size, int const diff_offset, int const diff_dir)
+{
+  unsigned char A[size];
+  unsigned char B[size];
+  unsigned char const fill_value = 0x55;
+  __builtin_memset (A, fill_value, size);
+  __builtin_memset (B, fill_value, size);
+
+  if (diff_dir != 0)
+    {
+      if (diff_dir < 0)
+        {
+          A[diff_offset] = fill_value - 1;
+        }
+      else
+        {
+          A[diff_offset] = fill_value + 1;
+        }
+    }
+
+  if (__builtin_memcmp (A, B, size) != diff_dir)
+    {
+      abort ();
+    }
+}
+
+int
+main ()
+{
+  do_one_test (0, 0, 0);
+
+  do_one_test (1, 0, -1);
+  do_one_test (1, 0, 0);
+  do_one_test (1, 0, 1);
+
+  do_one_test (MIN_VECTOR_BYTES - 1, 0, -1);
+  do_one_test (MIN_VECTOR_BYTES - 1, 0, 0);
+  do_one_test (MIN_VECTOR_BYTES - 1, 0, 1);
+  do_one_test (MIN_VECTOR_BYTES - 1, 1, -1);
+  do_one_test (MIN_VECTOR_BYTES - 1, 1, 0);
+  do_one_test (MIN_VECTOR_BYTES - 1, 1, 1);
+
+  do_one_test (MIN_VECTOR_BYTES, 0, -1);
+  do_one_test (MIN_VECTOR_BYTES, 0, 0);
+  do_one_test (MIN_VECTOR_BYTES, 0, 1);
+  do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, -1);
+  do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, 0);
+  do_one_test (MIN_VECTOR_BYTES, MIN_VECTOR_BYTES - 1, 1);
+
+  do_one_test (MIN_VECTOR_BYTES + 1, 0, -1);
+  do_one_test (MIN_VECTOR_BYTES + 1, 0, 0);
+  do_one_test (MIN_VECTOR_BYTES + 1, 0, 1);
+  do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, -1);
+  do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, 0);
+  do_one_test (MIN_VECTOR_BYTES + 1, MIN_VECTOR_BYTES, 1);
+
+  do_one_test (MIN_VECTOR_BYTES * 8, 0, -1);
+  do_one_test (MIN_VECTOR_BYTES * 8, 0, 0);
+  do_one_test (MIN_VECTOR_BYTES * 8, 0, 1);
+  do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, -1);
+  do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, 0);
+  do_one_test (MIN_VECTOR_BYTES * 8, MIN_VECTOR_BYTES * 8 - 1, 1);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-3.c
new file mode 100644
index 00000000000..5ca31af90fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-3.c
@@ -0,0 +1,45 @@ 
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Tiny __builtin_memcmp should use libc.
+** f1:
+**  li\s+a\d,\d+
+**  tail\s+memcmp
+*/
+int
+f1 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=1
+** f2:
+**  (
+**  vsetivli\s+zero,\d+,e8,m1,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m1,ta,ma
+**  )
+**  ...
+**  ret
+*/
+int
+f2 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f3:
+**  li\s+a2,\d+
+**  tail\s+memcmp
+*/
+int
+f3 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES + 1);
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-4.c b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-4.c
new file mode 100644
index 00000000000..5860b27a233
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/cmpmem-4.c
@@ -0,0 +1,62 @@ 
+/* { dg-do compile } */
+/* { dg-add-options riscv_v } */
+/* { dg-additional-options "-O3 -mrvv-max-lmul=m8" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#define MIN_VECTOR_BYTES (__riscv_v_min_vlen / 8)
+
+/* Tiny __builtin_memcmp should use libc.
+** f1:
+**  li\s+a\d,\d+
+**  tail\s+memcmp
+*/
+int
+f1 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES - 1);
+}
+
+/* Vectorise+inline minimum vector register width with LMUL=8 as requested
+** f2:
+**  (
+**  vsetivli\s+zero,\d+,e8,m8,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  )
+**  ...
+**  ret
+*/
+int
+f2 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES);
+}
+
+/* Vectorise+inline anything that fits
+** f3:
+**  (
+**  vsetivli\s+zero,\d+,e8,m8,ta,ma
+**  |
+**  li\s+a\d+,\d+
+**  vsetvli\s+zero,a\d+,e8,m8,ta,ma
+**  )
+**  ...
+**  ret
+*/
+int
+f3 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8);
+}
+
+/* Don't inline if the length is too large for one operation.
+** f4:
+**  li\s+a2,\d+
+**  tail\s+memcmp
+*/
+int
+f4 (void *a, void *b)
+{
+  return __builtin_memcmp (a, b, MIN_VECTOR_BYTES * 8 + 1);
+}