@@ -2644,6 +2644,7 @@ (define_expand "rawmemchr<ANYI:mode>"
;; =========================================================================
;; Includes:
;; - add
+;; - sub
;; =========================================================================
(define_expand "usadd<mode>3"
[(match_operand:V_VLSI 0 "register_operand")
@@ -2656,6 +2657,17 @@ (define_expand "usadd<mode>3"
}
)
+(define_expand "ussub<mode>3"
+ [(match_operand:V_VLSI 0 "register_operand")
+ (match_operand:V_VLSI 1 "register_operand")
+ (match_operand:V_VLSI 2 "register_operand")]
+ "TARGET_VECTOR"
+ {
+ riscv_vector::expand_vec_ussub (operands[0], operands[1], operands[2], <MODE>mode);
+ DONE;
+ }
+)
+
;; =========================================================================
;; == Early break auto-vectorization patterns
;; =========================================================================
@@ -638,6 +638,7 @@ void expand_vec_lround (rtx, rtx, machine_mode, machine_mode, machine_mode);
void expand_vec_lceil (rtx, rtx, machine_mode, machine_mode);
void expand_vec_lfloor (rtx, rtx, machine_mode, machine_mode);
void expand_vec_usadd (rtx, rtx, rtx, machine_mode);
+void expand_vec_ussub (rtx, rtx, rtx, machine_mode);
#endif
bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,
bool, void (*)(rtx *, rtx), enum avl_type);
@@ -4634,13 +4634,13 @@ emit_vec_cvt_x_f_rtz (rtx op_dest, rtx op_src, rtx mask,
}
static void
-emit_vec_saddu (rtx op_dest, rtx op_1, rtx op_2, insn_type type,
- machine_mode vec_mode)
+emit_vec_binary_alu (rtx op_dest, rtx op_1, rtx op_2, enum rtx_code rcode,
+ machine_mode vec_mode)
{
rtx ops[] = {op_dest, op_1, op_2};
- insn_code icode = code_for_pred (US_PLUS, vec_mode);
+ insn_code icode = code_for_pred (rcode, vec_mode);
- emit_vlmax_insn (icode, type, ops);
+ emit_vlmax_insn (icode, BINARY_OP, ops);
}
void
@@ -4876,7 +4876,16 @@ expand_vec_lfloor (rtx op_0, rtx op_1, machine_mode vec_fp_mode,
void
expand_vec_usadd (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
{
- emit_vec_saddu (op_0, op_1, op_2, BINARY_OP, vec_mode);
+ emit_vec_binary_alu (op_0, op_1, op_2, US_PLUS, vec_mode);
+}
+
+/* Expand the standard name usadd<mode>3 for vector mode, we can leverage
+ the vector fixed point vector single-width saturating add directly. */
+
+void
+expand_vec_ussub (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+ emit_vec_binary_alu (op_0, op_1, op_2, US_MINUS, vec_mode);
}
/* Vectorize popcount by the Wilkes-Wheeler-Gill algorithm that libgcc uses as
new file mode 100644
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fdump-rtl-expand-details -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "../../../sat_arith.h"
+
+/*
+** vec_sat_u_sub_uint8_t_fmt_1:
+** ...
+** vsetvli\s+[atx][0-9]+,\s*[atx][0-9]+,\s*e8,\s*m1,\s*ta,\s*ma
+** vle8\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vle8\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vssubu\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+DEF_VEC_SAT_U_SUB_FMT_1(uint8_t)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 4 "expand" } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fdump-rtl-expand-details -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "../../../sat_arith.h"
+
+/*
+** vec_sat_u_sub_uint16_t_fmt_1:
+** ...
+** vsetvli\s+[atx][0-9]+,\s*[atx][0-9]+,\s*e16,\s*m1,\s*ta,\s*ma
+** ...
+** vle16\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vle16\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vssubu\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+DEF_VEC_SAT_U_SUB_FMT_1(uint16_t)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 4 "expand" } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fdump-rtl-expand-details -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "../../../sat_arith.h"
+
+/*
+** vec_sat_u_sub_uint32_t_fmt_1:
+** ...
+** vsetvli\s+[atx][0-9]+,\s*[atx][0-9]+,\s*e32,\s*m1,\s*ta,\s*ma
+** ...
+** vle32\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vle32\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vssubu\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+DEF_VEC_SAT_U_SUB_FMT_1(uint32_t)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 4 "expand" } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fdump-rtl-expand-details -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-skip-if "" { *-*-* } { "-flto" } } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "../../../sat_arith.h"
+
+/*
+** vec_sat_u_sub_uint64_t_fmt_1:
+** ...
+** vsetvli\s+[atx][0-9]+,\s*[atx][0-9]+,\s*e64,\s*m1,\s*ta,\s*ma
+** ...
+** vle64\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vle64\.v\s+v[0-9]+,\s*0\([atx][0-9]+\)
+** vssubu\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+
+** ...
+*/
+DEF_VEC_SAT_U_SUB_FMT_1(uint64_t)
+
+/* { dg-final { scan-rtl-dump-times ".SAT_SUB " 4 "expand" } } */
new file mode 100644
@@ -0,0 +1,75 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "../../../sat_arith.h"
+
+#define T uint8_t
+#define N 16
+#define RUN_VEC_SAT_BINARY RUN_VEC_SAT_U_SUB_FMT_1
+
+DEF_VEC_SAT_U_SUB_FMT_1(T)
+
+T test_data[][3][N] = {
+ {
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ }, /* arg_0 */
+ {
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ }, /* arg_1 */
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ }, /* expect */
+ },
+ {
+ {
+ 0, 255, 255, 255,
+ 0, 255, 255, 255,
+ 0, 255, 255, 255,
+ 0, 255, 255, 255,
+ },
+ {
+ 1, 255, 254, 251,
+ 1, 255, 254, 251,
+ 1, 255, 254, 251,
+ 1, 255, 254, 251,
+ },
+ {
+ 0, 0, 1, 4,
+ 0, 0, 1, 4,
+ 0, 0, 1, 4,
+ 0, 0, 1, 4,
+ },
+ },
+ {
+ {
+ 0, 0, 1, 0,
+ 1, 2, 3, 0,
+ 1, 2, 3, 255,
+ 5, 254, 255, 9,
+ },
+ {
+ 0, 1, 0, 254,
+ 254, 254, 254, 255,
+ 255, 255, 0, 252,
+ 255, 255, 255, 1,
+ },
+ {
+ 0, 0, 1, 0,
+ 0, 0, 0, 0,
+ 0, 0, 3, 3,
+ 0, 0, 0, 8,
+ },
+ },
+};
+
+#include "vec_sat_binary.h"
new file mode 100644
@@ -0,0 +1,75 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "../../../sat_arith.h"
+
+#define T uint16_t
+#define N 16
+#define RUN_VEC_SAT_BINARY RUN_VEC_SAT_U_SUB_FMT_1
+
+DEF_VEC_SAT_U_SUB_FMT_1(T)
+
+T test_data[][3][N] = {
+ {
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ }, /* arg_0 */
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ }, /* arg_1 */
+ {
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 0,
+ }, /* expect */
+ },
+ {
+ {
+ 65535, 65535, 65535, 65535,
+ 65535, 65535, 65535, 65535,
+ 65535, 65535, 65535, 65535,
+ 65535, 65535, 65535, 65535,
+ },
+ {
+ 55535, 45535, 35535, 25535,
+ 55535, 45535, 35535, 25535,
+ 55535, 45535, 35535, 25535,
+ 55535, 45535, 35535, 25535,
+ },
+ {
+ 10000, 20000, 30000, 40000,
+ 10000, 20000, 30000, 40000,
+ 10000, 20000, 30000, 40000,
+ 10000, 20000, 30000, 40000,
+ },
+ },
+ {
+ {
+ 0, 0, 1, 0,
+ 1, 2, 3, 0,
+ 1, 65535, 3, 65535,
+ 5, 65534, 65535, 9,
+ },
+ {
+ 0, 1, 1, 65534,
+ 65534, 65534, 1, 65535,
+ 0, 65535, 65535, 0,
+ 65535, 65535, 1, 2,
+ },
+ {
+ 0, 0, 0, 0,
+ 0, 0, 2, 0,
+ 1, 0, 0, 65535,
+ 0, 0, 65534, 7,
+ },
+ },
+};
+
+#include "vec_sat_binary.h"
new file mode 100644
@@ -0,0 +1,75 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "../../../sat_arith.h"
+
+#define T uint32_t
+#define N 16
+#define RUN_VEC_SAT_BINARY RUN_VEC_SAT_U_SUB_FMT_1
+
+DEF_VEC_SAT_U_SUB_FMT_1(T)
+
+T test_data[][3][N] = {
+ {
+ {
+ 0, 0, 4, 0,
+ 0, 0, 4, 0,
+ 0, 0, 4, 0,
+ 0, 0, 4, 0,
+ }, /* arg_0 */
+ {
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ 0, 1, 2, 3,
+ }, /* arg_1 */
+ {
+ 0, 0, 2, 0,
+ 0, 0, 2, 0,
+ 0, 0, 2, 0,
+ 0, 0, 2, 0,
+ }, /* expect */
+ },
+ {
+ {
+ 4294967295, 4294967295, 4294967295, 4294967295,
+ 4294967295, 4294967295, 4294967295, 4294967295,
+ 4294967295, 4294967295, 4294967295, 4294967295,
+ 4294967295, 4294967295, 4294967295, 4294967295,
+ },
+ {
+ 1294967295, 2294967295, 3294967295, 4294967295,
+ 1294967295, 2294967295, 3294967295, 4294967295,
+ 1294967295, 2294967295, 3294967295, 4294967295,
+ 1294967295, 2294967295, 3294967295, 4294967295,
+ },
+ {
+ 3000000000, 2000000000, 1000000000, 0,
+ 3000000000, 2000000000, 1000000000, 0,
+ 3000000000, 2000000000, 1000000000, 0,
+ 3000000000, 2000000000, 1000000000, 0,
+ },
+ },
+ {
+ {
+ 0, 0, 9, 0,
+ 1, 4294967295, 3, 0,
+ 1, 2, 3, 4,
+ 5, 4294967294, 4294967295, 4294967295,
+ },
+ {
+ 0, 1, 1, 4294967294,
+ 1, 2, 4294967294, 4294967295,
+ 1, 4294967295, 4294967295, 1,
+ 1, 4294967295, 4294967290, 9,
+ },
+ {
+ 0, 0, 8, 0,
+ 0, 4294967293, 0, 0,
+ 0, 0, 0, 3,
+ 4, 0, 5, 4294967286,
+ },
+ },
+};
+
+#include "vec_sat_binary.h"
new file mode 100644
@@ -0,0 +1,75 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99" } */
+
+#include "../../../sat_arith.h"
+
+#define T uint64_t
+#define N 16
+#define RUN_VEC_SAT_BINARY RUN_VEC_SAT_U_SUB_FMT_1
+
+DEF_VEC_SAT_U_SUB_FMT_1(T)
+
+T test_data[][3][N] = {
+ {
+ {
+ 0, 9, 0, 0,
+ 0, 9, 0, 0,
+ 0, 9, 0, 0,
+ 0, 9, 0, 0,
+ }, /* arg_0 */
+ {
+ 0, 2, 3, 1,
+ 0, 2, 3, 1,
+ 0, 2, 3, 1,
+ 0, 2, 3, 1,
+ }, /* arg_1 */
+ {
+ 0, 7, 0, 0,
+ 0, 7, 0, 0,
+ 0, 7, 0, 0,
+ 0, 7, 0, 0,
+ }, /* expect */
+ },
+ {
+ {
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 18446744073709551615u,
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 18446744073709551615u,
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 18446744073709551615u,
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 18446744073709551615u,
+ },
+ {
+ 10446744073709551615u, 11446744073709551615u, 12446744073709551615u, 18446744073709551615u,
+ 10446744073709551615u, 11446744073709551615u, 12446744073709551615u, 18446744073709551615u,
+ 10446744073709551615u, 11446744073709551615u, 12446744073709551615u, 18446744073709551615u,
+ 10446744073709551615u, 11446744073709551615u, 12446744073709551615u, 18446744073709551615u,
+ },
+ {
+ 8000000000000000000u, 7000000000000000000u, 6000000000000000000u, 0u,
+ 8000000000000000000u, 7000000000000000000u, 6000000000000000000u, 0u,
+ 8000000000000000000u, 7000000000000000000u, 6000000000000000000u, 0u,
+ 8000000000000000000u, 7000000000000000000u, 6000000000000000000u, 0u,
+ },
+ },
+ {
+ {
+ 0, 18446744073709551615u, 1, 0,
+ 1, 18446744073709551615u, 3, 0,
+ 1, 18446744073709551614u, 3, 4,
+ 5, 18446744073709551614u, 18446744073709551615u, 9,
+ },
+ {
+ 0, 1, 1, 18446744073709551614u,
+ 18446744073709551614u, 18446744073709551614u, 18446744073709551614u, 18446744073709551615u,
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 18446744073709551615u,
+ 18446744073709551615u, 18446744073709551615u, 18446744073709551615u, 1,
+ },
+ {
+ 0, 18446744073709551614u, 0, 0,
+ 0, 1, 0, 0,
+ 0, 0, 0, 0,
+ 0, 0, 0, 8,
+ },
+ },
+};
+
+#include "vec_sat_binary.h"
@@ -95,4 +95,35 @@ sat_u_sub_##T##_fmt_2 (T x, T y) \
#define RUN_SAT_U_SUB_FMT_1(T, x, y) sat_u_sub_##T##_fmt_1(x, y)
#define RUN_SAT_U_SUB_FMT_2(T, x, y) sat_u_sub_##T##_fmt_2(x, y)
+#define DEF_VEC_SAT_U_SUB_FMT_1(T) \
+void __attribute__((noinline)) \
+vec_sat_u_sub_##T##_fmt_1 (T *out, T *op_1, T *op_2, unsigned limit) \
+{ \
+ unsigned i; \
+ for (i = 0; i < limit; i++) \
+ { \
+ T x = op_1[i]; \
+ T y = op_2[i]; \
+ out[i] = (x - y) & (-(T)(x >= y)); \
+ } \
+}
+
+#define DEF_VEC_SAT_U_SUB_FMT_2(T) \
+void __attribute__((noinline)) \
+vec_sat_u_sub_##T##_fmt_2 (T *out, T *op_1, T *op_2, unsigned limit) \
+{ \
+ unsigned i; \
+ for (i = 0; i < limit; i++) \
+ { \
+ T x = op_1[i]; \
+ T y = op_2[i]; \
+ out[i] = (x - y) & (-(T)(x > y)); \
+ } \
+}
+
+#define RUN_VEC_SAT_U_SUB_FMT_1(T, out, op_1, op_2, N) \
+ vec_sat_u_sub_##T##_fmt_1(out, op_1, op_2, N)
+#define RUN_VEC_SAT_U_SUB_FMT_2(T, out, op_1, op_2, N) \
+ vec_sat_u_sub_##T##_fmt_2(out, op_1, op_2, N)
+
#endif