@@ -26817,4 +26817,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_m
emit_move_insn (output, gen_lowpart (out_mode, d.target));
}
+/* Expand SF->BF truncation of SRC (V4SF/V8SF/V16SF) into DEST via vec_perm. */
+void
+ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
+{
+ machine_mode vperm_mode, src_mode = GET_MODE (src);
+ switch (src_mode) /* VPERM_MODE: BF vector mode of the same size as SRC. */
+ {
+ case V16SFmode:
+ vperm_mode = V32BFmode;
+ break;
+ case V8SFmode:
+ vperm_mode = V16BFmode;
+ break;
+ case V4SFmode:
+ vperm_mode = V8BFmode;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ /* Viewing SRC as BF, pick odd elements: the upper 16 bits of each SF. */
+ int nelt = GET_MODE_NUNITS (vperm_mode);
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
+ for (int i = 0; i != nelt; i++)
+ sel[i] = (2 * i + 1) % nelt; /* 1, 3, ..., nelt - 1, then repeated. */
+ vec_perm_indices indices (sel, 1, nelt);
+ /* NOTE(review): low 16 bits are dropped, not rounded -- confirm intended. */
+ rtx target = gen_reg_rtx (vperm_mode);
+ rtx op0 = lowpart_subreg (vperm_mode,
+ force_reg (src_mode, src),
+ src_mode);
+ bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
+ target, op0, op0, indices);
+ gcc_assert (ok); /* Result is the lowpart of TARGET, copied below. */
+ emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
+}
+
+
#include "gt-i386-expand.h"
@@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args);
extern bool ix86_ternlog_operand_p (rtx op);
extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
int idx, rtx target);
+extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src);
#ifdef TREE_CODE
extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
@@ -2994,6 +2994,24 @@ (define_expand "truncv2sfv2hf2"
DONE;
})
+(define_expand "truncv2sfv2bf2"
+ [(set (match_operand:V2BF 0 "register_operand")
+ (float_truncate:V2BF
+ (match_operand:V2SF 1 "nonimmediate_operand")))]
+ "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+{
+ rtx op1 = gen_reg_rtx (V4SFmode);
+ rtx op0 = gen_reg_rtx (V4BFmode);
+ /* Copy the V2SF input into the lowpart of OP1 to reuse truncv4sfv4bf2. */
+ emit_move_insn (op1, lowpart_subreg (V4SFmode,
+ force_reg (V2SFmode, operands[1]),
+ V2SFmode));
+ emit_insn (gen_truncv4sfv4bf2 (op0, op1));
+ /* Only the V2BF lowpart of the V4BF result is the answer. */
+ emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode));
+ DONE;
+})
+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel integral arithmetic
@@ -30952,6 +30952,24 @@ (define_insn "avx512f_cvtne2ps2bf16_<mode><mask_name>"
"TARGET_AVX512BF16"
"vcvtne2ps2bf16\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}")
+(define_expand "truncv4sfv4bf2"
+ [(set (match_operand:V4BF 0 "register_operand")
+ (float_truncate:V4BF
+ (match_operand:V4SF 1 "nonimmediate_operand")))]
+ "TARGET_SSSE3"
+{ /* Permute unless AVXNECONVERT or AVX512BF16+AVX512VL is enabled. */
+ if (!TARGET_AVXNECONVERT
+ && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+ else
+ { /* vcvtneps2bf16_v4sf writes V8BF; operand 0 is its V4BF lowpart. */
+ rtx dest = gen_reg_rtx (V8BFmode);
+ emit_insn (gen_vcvtneps2bf16_v4sf (dest, operands[1]));
+ emit_move_insn (operands[0], lowpart_subreg (V4BFmode, dest, V8BFmode));
+ }
+ DONE;
+})
+
(define_expand "vcvtneps2bf16_v4sf"
[(set (match_operand:V8BF 0 "register_operand")
(vec_concat:V8BF
@@ -31027,6 +31045,20 @@ (define_expand "avx512f_cvtneps2bf16_<mode>_maskz"
DONE;
})
+(define_expand "truncv8sfv8bf2"
+ [(set (match_operand:V8BF 0 "register_operand")
+ (float_truncate:V8BF
+ (match_operand:V8SF 1 "nonimmediate_operand")))]
+ "TARGET_AVX2"
+{ /* Permute unless AVXNECONVERT or AVX512BF16+AVX512VL is enabled. */
+ if (!TARGET_AVXNECONVERT
+ && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
+ {
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+ DONE;
+ } /* Otherwise the float_truncate pattern above is emitted as is. */
+})
+
(define_insn "vcvtneps2bf16_v8sf"
[(set (match_operand:V8BF 0 "register_operand" "=x,v")
(float_truncate:V8BF
@@ -31039,6 +31071,18 @@ (define_insn "vcvtneps2bf16_v8sf"
(set_attr "addr" "gpr16,*")
(set_attr "prefix" "vex,evex")])
+(define_expand "truncv16sfv16bf2"
+ [(set (match_operand:V16BF 0 "register_operand")
+ (float_truncate:V16BF
+ (match_operand:V16SF 1 "nonimmediate_operand")))]
+ "TARGET_AVX512BW && TARGET_EVEX512"
+{ /* Without AVX512BF16, expand the truncation as a permutation. */
+ if (!TARGET_AVX512BF16)
+ {
+ ix86_expand_vector_sf2bf_with_vec_perm (operands[0], operands[1]);
+ DONE;
+ } /* Otherwise the float_truncate pattern above is emitted as is. */
+})
(define_insn "avx512f_cvtneps2bf16_<mode><mask_name>"
[(set (match_operand:<sf_cvt_bf16> 0 "register_operand" "=v")
new file mode 100644
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
+/* Reuses the functions of the included test with AVX512BF16 enabled. */
+#include "avx512bw-truncsfbf.c"
new file mode 100644
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
+/* Six conversions follow; the scan above expects one permute insn for each. */
+typedef float v4sf __attribute__((vector_size(16)));
+typedef float v8sf __attribute__((vector_size(32)));
+typedef float v16sf __attribute__((vector_size(64)));
+typedef __bf16 v4bf __attribute__((vector_size(8)));
+typedef __bf16 v8bf __attribute__((vector_size(16)));
+typedef __bf16 v16bf __attribute__((vector_size(32)));
+/* The unused B presumably just moves A to the 2nd vector arg -- TODO confirm. */
+v4bf
+foo (v4sf b, v4sf a)
+{
+ return __builtin_convertvector (a, v4bf);
+}
+
+v8bf
+foo2 (v8sf b, v8sf a)
+{
+ return __builtin_convertvector (a, v8bf);
+}
+
+v16bf
+foo3 (v16sf b, v16sf a)
+{
+ return __builtin_convertvector (a, v16bf);
+}
+/* Same conversions with the source vector loaded through a pointer. */
+v4bf
+foo_mem (v4sf* a)
+{
+ return __builtin_convertvector (*a, v4bf);
+}
+
+v8bf
+foo2_mem (v8sf* a)
+{
+ return __builtin_convertvector (*a, v8bf);
+}
+
+v16bf
+foo3_mem (v16sf* a)
+{
+ return __builtin_convertvector (*a, v16bf);
+}
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
+/* Two V2SF -> V2BF conversions; one pshufb is expected for each off ia32. */
+typedef float v2sf __attribute__((vector_size(8)));
+typedef __bf16 v2bf __attribute__((vector_size(4)));
+
+v2bf
+foo (v2sf b, v2sf a)
+{
+ return __builtin_convertvector (a, v2bf);
+}
+
+/* Same conversion with the source vector loaded through a pointer. */
+v2bf
+foo_mem (v2sf* a)
+{
+ return __builtin_convertvector (*a, v2bf);
+}
+