@@ -6887,7 +6887,7 @@ (define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
[(set_attr "movprfx" "*,yes")]
)
-(define_insn "@<sur>dot_prod<vsi2qi>"
+(define_insn "@<sur>dot_prod<vsi2qi>_insn"
[(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
(plus:VNx4SI_ONLY
(unspec:VNx4SI_ONLY
@@ -6902,6 +6902,43 @@ (define_insn "@<sur>dot_prod<vsi2qi>"
[(set_attr "movprfx" "*,yes")]
)
+;; Expander for <sur>dot_prod<vsi2qi> over the DOTPROD_US_ONLY unspecs:
+;;   operand 0 = dot-product (operand 1, operand 2) + operand 3.
+;; When TARGET_SVE_I8MM is set the matching _insn pattern is emitted as is.
+;; Otherwise the result is built from one subtraction and three sdot_prod
+;; steps, relying on the identity
+;;   op1 * op2 == (op1 - S) * op2 + (S - 1) * op2 + 1 * op2
+;; where S is the sign-bit value of one narrow (<VSI2QI>) element.
+(define_expand "@<sur>dot_prod<vsi2qi>"
+ [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
+ (plus:VNx4SI_ONLY
+ (unspec:VNx4SI_ONLY
+ [(match_operand:<VSI2QI> 1 "register_operand")
+ (match_operand:<VSI2QI> 2 "register_operand")]
+ DOTPROD_US_ONLY)
+ (match_operand:VNx4SI_ONLY 3 "register_operand")))]
+ "TARGET_SVE || TARGET_SVE_I8MM"
+{
+ /* Direct path: pass all four operands straight to the _insn pattern.  */
+ if (TARGET_SVE_I8MM)
+ {
+ emit_insn (gen_usdot_prod<vsi2qi>_insn (operands[0], operands[1],
+ operands[2], operands[3]));
+ DONE;
+ }
+
+ /* Fallback path.  VAL is the value of the top bit of one narrow element
+    and SIGNBIT is that value as a constant in the element mode.  */
+ machine_mode elemmode = GET_MODE_INNER (<VSI2QI>mode);
+ HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+ rtx signbit = gen_int_mode (val, elemmode);
+ /* T1 and T2 hold the intermediate wide accumulators; TMP holds the
+    biased copy of operand 1.  */
+ rtx t1 = gen_reg_rtx (<MODE>mode);
+ rtx t2 = gen_reg_rtx (<MODE>mode);
+ rtx tmp = gen_reg_rtx (<VSI2QI>mode);
+ /* C1 is a vector of (VAL - 1), C2 a vector of 1 and DUP a vector of the
+    sign bit.  NOTE(review): VAL is presumably split into (VAL - 1) + 1
+    because VAL itself is not a positive value in the signed element
+    mode -- confirm.  */
+ rtx c1 = gen_const_vec_duplicate (<VSI2QI>mode,
+ gen_int_mode (val - 1, elemmode));
+ rtx c2 = gen_const_vec_duplicate (<VSI2QI>mode, gen_int_mode (1, elemmode));
+ rtx dup = gen_const_vec_duplicate (<VSI2QI>mode, signbit);
+ /* The constant vectors are forced into registers before use.  */
+ c1 = force_reg (<VSI2QI>mode, c1);
+ c2 = force_reg (<VSI2QI>mode, c2);
+ dup = force_reg (<VSI2QI>mode, dup);
+ /* TMP = operand 1 - DUP.  The three steps that follow chain their
+    results through the last argument: operand 3 -> T1 -> T2 -> operand 0.
+    NOTE(review): this assumes sdot_prod accumulates into its last operand
+    in the same way as the pattern above -- confirm.  */
+ emit_insn (gen_sub<vsi2qi>3 (tmp, operands[1], dup));
+ emit_insn (gen_sdot_prod<vsi2qi> (t1, tmp, operands[2], operands[3]));
+ emit_insn (gen_sdot_prod<vsi2qi> (t2, c1, operands[2], t1));
+ emit_insn (gen_sdot_prod<vsi2qi> (operands[0], c2, operands[2], t2));
+ DONE;
+})
+
(define_insn "@aarch64_<sur>dot_prod_lane<vsi2qi>"
[(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
(plus:VNx4SI_ONLY
new file mode 100644
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+/* NOTE(review): "+noi8mm" presumably switches the I8MM extension off for
+   the code that follows, so the compiler cannot use it here -- confirm
+   against the target pragma documentation.  */
+#pragma GCC target "+noi8mm"
+
+/* Number of elements processed by the loop in f.  */
+#define N 480
+/* Signedness selectors used by f:
+   1 - the accumulator and return type (int),
+   2 - the short that receives each product,
+   3 - the char elements of the first array,
+   4 - the char elements of the second array.  */
+#define SIGNEDNESS_1 unsigned
+#define SIGNEDNESS_2 signed
+#define SIGNEDNESS_3 signed
+#define SIGNEDNESS_4 unsigned
+
+/* Return RES plus, for each of the N elements, the product a[i] * b[i]
+   after that product has been narrowed to SIGNEDNESS_2 short.
+   A holds SIGNEDNESS_3 char elements and B holds SIGNEDNESS_4 char
+   elements; both are restrict-qualified.  The noipa attribute keeps the
+   function away from interprocedural optimization.  */
+SIGNEDNESS_1 int __attribute__ ((noipa))
+f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
+ SIGNEDNESS_4 char *restrict b)
+{
+ for (__INTPTR_TYPE__ i = 0; i < N; ++i)
+ {
+ /* Widen both inputs to int before multiplying.  */
+ int av = a[i];
+ int bv = b[i];
+ /* The int product is converted to short before being accumulated.  */
+ SIGNEDNESS_2 short mult = av * bv;
+ res += mult;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not {\tusdot\t} } } */
+/* { dg-final { scan-assembler-times {\tsdot\t} 3 } } */