@@ -3883,31 +3883,58 @@ (define_expand "cbranch<mode>4"
"TARGET_SIMD"
{
auto code = GET_CODE (operands[0]);
- rtx tmp = operands[1];
-
- /* If comparing against a non-zero vector we have to do a comparison first
- so we can have a != 0 comparison with the result. */
- if (operands[2] != CONST0_RTX (<MODE>mode))
- emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
- operands[2]));
-
- /* For 64-bit vectors we need no reductions. */
- if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+  /* If SVE is available, let's borrow some instructions.  We will optimize
+     these further later in combine.  */
+ if (TARGET_SVE)
{
- /* Always reduce using a V4SI. */
- rtx reduc = gen_lowpart (V4SImode, tmp);
- rtx res = gen_reg_rtx (V4SImode);
- emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
- emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+ machine_mode full_mode = aarch64_full_sve_mode (<VEL>mode).require ();
+ rtx in1 = lowpart_subreg (full_mode, operands[1], <MODE>mode);
+ rtx in2 = lowpart_subreg (full_mode, operands[2], <MODE>mode);
+
+ machine_mode pred_mode = aarch64_sve_pred_mode (full_mode);
+ rtx_vector_builder builder (VNx16BImode, 16, 2);
+ for (unsigned int i = 0; i < 16; ++i)
+ builder.quick_push (CONST1_RTX (BImode));
+ for (unsigned int i = 0; i < 16; ++i)
+ builder.quick_push (CONST0_RTX (BImode));
+ rtx ptrue = force_reg (VNx16BImode, builder.build ());
+ rtx cast_ptrue = gen_lowpart (pred_mode, ptrue);
+ rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
+
+ rtx tmp = gen_reg_rtx (pred_mode);
+ aarch64_expand_sve_vec_cmp_int (tmp, reverse_condition (code), in1, in2);
+ emit_insn (gen_aarch64_ptest (pred_mode, ptrue, cast_ptrue, ptrue_flag, tmp));
+ operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+ operands[2] = const0_rtx;
}
+ else
+ {
+ rtx tmp = operands[1];
- rtx val = gen_reg_rtx (DImode);
- emit_move_insn (val, gen_lowpart (DImode, tmp));
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
+ operands[2]));
- rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
- rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
- emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
- DONE;
+ /* For 64-bit vectors we need no reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ rtx reduc = gen_lowpart (V4SImode, tmp);
+ rtx res = gen_reg_rtx (V4SImode);
+ emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+ emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+ }
+
+ rtx val = gen_reg_rtx (DImode);
+ emit_move_insn (val, gen_lowpart (DImode, tmp));
+
+ rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+ rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+ emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+ DONE;
+ }
})
;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
@@ -8123,6 +8123,105 @@ (define_insn "*aarch64_pred_cmp<cmp_op><mode>_wide_ptest"
"cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d"
)
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmp<UCOMPARISONS:cmp_op><mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (neg:<V128>
+ (UCOMPARISONS:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmp<UCOMPARISONS:cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Same as the above, but for the == comparison of the Advanced SIMD result
+;; rather than the unsigned comparisons.
+(define_insn "*aarch64_pred_cmpeq<mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (neg:<V128>
+ (eq:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmpeq\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Same as the above, but for the != comparison, which combine canonicalizes
+;; as (plus (eq ...) (const_int -1)).
+(define_insn "*aarch64_pred_cmpne<mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (plus:<V128>
+ (eq:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))
+ (match_operand:<V128> 9 "aarch64_simd_imm_minus_one" "i")) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmpne\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] While tests
;; -------------------------------------------------------------------------
@@ -8602,7 +8701,7 @@ (define_expand "cbranch<mode>4"
)
;; See "Description of UNSPEC_PTEST" above for details.
-(define_insn "aarch64_ptest<mode>"
+(define_insn "@aarch64_ptest<mode>"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa")
(match_operand 1)
@@ -906,6 +906,7 @@ from the machine description file `md'. */\n\n");
printf ("#include \"tm-constrs.h\"\n");
printf ("#include \"ggc.h\"\n");
printf ("#include \"target.h\"\n\n");
+ printf ("#include \"rtx-vector-builder.h\"\n\n");
/* Read the machine description. */
new file mode 100644
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+** ...
+** cmpgt p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** cmpge p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** cmpeq p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** cmplt p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** cmple p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
new file mode 100644
@@ -0,0 +1,114 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param=aarch64-autovec-preference=1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+** ...
+** cmgt v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** cmge v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** cmpeq p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+** b.any \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+** b.any \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** cmlt v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** cmle v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}