@@ -160,6 +160,7 @@ arm_cpu_builtins (struct cpp_reader* pfile)
TARGET_VFP_FP16INST);
def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC",
TARGET_NEON_FP16INST);
+ def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_FML", TARGET_FP16FML);
def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA);
def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON);
@@ -165,6 +165,9 @@ define feature fp16
# Dot Product instructions extension to ARMv8.2-a.
define feature dotprod
+# Half-precision floating-point instructions in ARMv8.4-A.
+define feature fp16fml
+
# ISA Quirks (errata?). Don't forget to add this to the fgroup
# ALL_QUIRKS below.
@@ -202,7 +205,7 @@ define fgroup ALL_CRYPTO crypto
# strip off 32 D-registers, but does not remove support for
# double-precision FP.
define fgroup ALL_SIMD_INTERNAL fp_d32 neon ALL_CRYPTO
-define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod
+define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod fp16fml
# List of all FPU bits to strip out if -mfpu is used to override the
# default. fp16 is deliberately missing from this list.
@@ -581,6 +584,7 @@ begin arch armv8.2-a
isa ARMv8_2a
option simd add FP_ARMv8 NEON
option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
option crypto add FP_ARMv8 CRYPTO
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
@@ -595,6 +599,7 @@ begin arch armv8.3-a
isa ARMv8_3a
option simd add FP_ARMv8 NEON
option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
option crypto add FP_ARMv8 CRYPTO
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
@@ -608,7 +613,7 @@ begin arch armv8.4-a
profile A
isa ARMv8_4a
option simd add FP_ARMv8 DOTPROD
- option fp16 add fp16 FP_ARMv8 DOTPROD
+ option fp16 add fp16 fp16fml FP_ARMv8 DOTPROD
option crypto add FP_ARMv8 CRYPTO DOTPROD
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
@@ -67,6 +67,7 @@ VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */
VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */
VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
+VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
/* Fraction and accumulator vector modes. */
VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */
@@ -216,10 +216,18 @@ extern tree arm_fp16_type_node;
isa_bit_dotprod) \
&& arm_arch8_2)
-/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */
+/* FPU supports the floating point FP16 instructions for ARMv8.2-A
+ and later. */
#define TARGET_VFP_FP16INST \
(TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst)
+/* Target supports the floating point FP16 instructions from ARMv8.2-A
+ and later. */
+#define TARGET_FP16FML (TARGET_NEON \
+ && bitmap_bit_p (arm_active_target.isa, \
+ isa_bit_fp16fml) \
+ && arm_arch8_2)
+
/* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later. */
#define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA)
@@ -18104,6 +18104,69 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
#pragma GCC pop_options
#endif
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16fml")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmal_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmsl_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmal_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmsl_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmal_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmsl_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmal_highv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+#endif
+
#ifdef __cplusplus
}
#endif
@@ -51,6 +51,10 @@ VAR2 (TERNOP, vqdmlal, v4hi, v2si)
VAR2 (TERNOP, vqdmlsl, v4hi, v2si)
VAR4 (TERNOP, vqrdmlah, v4hi, v2si, v8hi, v4si)
VAR4 (TERNOP, vqrdmlsh, v4hi, v2si, v8hi, v4si)
+VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
@@ -247,6 +247,9 @@ (define_code_iterator SHIFTABLE_OPS [plus minus ior xor and])
;; Operations on the sign of a number.
(define_code_iterator ABSNEG [abs neg])
+;; The PLUS and MINUS operators.
+(define_code_iterator PLUSMINUS [plus minus])
+
;; Conversions.
(define_code_iterator FCVT [unsigned_float float])
@@ -266,6 +269,8 @@ (define_code_attr cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le")
(define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")])
+(define_code_attr vfml_op [(plus "a") (minus "s")])
+
;;----------------------------------------------------------------------------
;; Int iterators
;;----------------------------------------------------------------------------
@@ -412,6 +417,8 @@ (define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE])
(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
+(define_int_iterator VFMLHALVES [UNSPEC_VFML_LO UNSPEC_VFML_HI])
+
;;----------------------------------------------------------------------------
;; Mode attributes
;;----------------------------------------------------------------------------
@@ -471,6 +478,12 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
(V2SF "V2SF") (V4SF "V2SF")
(DI "V2DI") (V2DI "V2DI")])
+;; Mode mapping for VFM[A,S]L instructions.
+(define_mode_attr VFML [(V2SF "V4HF") (V4SF "V8HF")])
+
+;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
+(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
+
;; Similar, for three elements.
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
@@ -494,8 +507,14 @@ (define_mode_attr V_reg [(V8QI "P") (V16QI "q")
(V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q")
(DI "P") (V2DI "q")
- (SF "") (DF "P")
- (HF "")])
+ (V2HF "") (SF "")
+ (DF "P") (HF "")])
+
+;; Output template to select the high VFP register of a mult-register value.
+(define_mode_attr V_hi [(V2SF "p") (V4SF "f")])
+
+;; Output template to select the low VFP register of a mult-register value.
+(define_mode_attr V_lo [(V2SF "") (V4SF "e")])
;; Wider modes with the same number of elements.
(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
@@ -708,6 +727,7 @@ (define_mode_attr V_innermode [(V8QI "QI") (V4HI "HI") (V2SI "SI")])
(define_mode_attr F_constraint [(SF "t") (DF "w")])
(define_mode_attr vfp_type [(SF "s") (DF "d")])
(define_mode_attr vfp_double_cond [(SF "") (DF "&& TARGET_VFP_DOUBLE")])
+(define_mode_attr VF_constraint [(V2SF "t") (V4SF "w")])
;; Mode attribute used to build the "type" attribute.
(define_mode_attr q [(V8QI "") (V16QI "_q")
@@ -824,6 +844,12 @@ (define_int_attr sup [
(UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
])
+(define_int_attr vfml_half
+ [(UNSPEC_VFML_HI "high") (UNSPEC_VFML_LO "low")])
+
+(define_int_attr vfml_half_selector
+ [(UNSPEC_VFML_HI "true") (UNSPEC_VFML_LO "false")])
+
(define_int_attr vcvth_op
[(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a")
(UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m")
@@ -2290,6 +2290,98 @@ (define_expand "neon_vfms<VH:mode>"
DONE;
})
+;; The expand RTL structure here is not important.
+;; We use the gen_* functions anyway.
+;; We just need something to wrap the iterators around.
+
+(define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>"
+ [(set (match_operand:VCVTF 0 "s_register_operand")
+ (unspec:VCVTF
+ [(match_operand:VCVTF 1 "s_register_operand")
+ (PLUSMINUS:<VFML>
+ (match_operand:<VFML> 2 "s_register_operand")
+ (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))]
+ "TARGET_FP16FML"
+{
+ rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+ emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0],
+ operands[1],
+ operands[2],
+ operands[3],
+ half, half));
+ DONE;
+})
+
+(define_insn "vfmal_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (neg:<VFMLSEL>
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmal_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (neg:<VFMLSEL>
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
@@ -36,7 +36,7 @@ v7ve_fps := vfpv3-d16 vfpv3 vfpv3-d16-fp16 vfpv3-fp16 vfpv4 neon \
# Not all these permutations exist for all architecture variants, but
# it seems to work ok.
-v8_fps := simd fp16 crypto fp16+crypto dotprod
+v8_fps := simd fp16 crypto fp16+crypto dotprod fp16fml
# We don't do anything special with these. Pre-v4t probably doesn't work.
all_early_nofp := armv2 armv2a armv3 armv3m armv4 armv4t armv5 armv5t
@@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd
v8_a_nosimd_variants := +crc
v8_a_simd_variants := $(call all_feat_combs, simd crypto)
v8_1_a_simd_variants := $(call all_feat_combs, simd crypto)
-v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto dotprod)
+v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 fp16fml crypto dotprod)
v8_4_a_simd_variants := $(call all_feat_combs, simd fp16 crypto)
ifneq (,$(HAS_APROFILE))
@@ -412,4 +412,6 @@ (define_c_enum "unspec" [
UNSPEC_VRNDX
UNSPEC_DOT_S
UNSPEC_DOT_U
+ UNSPEC_VFML_LO
+ UNSPEC_VFML_HI
])
@@ -15860,6 +15860,11 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions.
The half-precision floating-point data processing instructions.
This also enables the Advanced SIMD and floating-point instructions.
+@item +fp16fml
+The half-precision floating-point fmla extension. This also enables
+the half-precision floating-point extension and Advanced SIMD and
+floating-point instructions.
+
@item +simd
The ARMv8.1-A Advanced SIMD and floating-point instructions.
@@ -15882,7 +15887,8 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions.
@item +fp16
The half-precision floating-point data processing instructions.
This also enables the Advanced SIMD and floating-point instructions as well
-as the Dot Product extension.
+as the Dot Product extension and the half-precision floating-point fmla
+extension.
@item +simd
The ARMv8.3-A Advanced SIMD and floating-point instructions as well as the
@@ -1769,6 +1769,12 @@ ARM target supports executing instructions from ARMv8.2-A with the Dot
Product extension. Some multilibs may be incompatible with these options.
Implies arm_v8_2a_dotprod_neon_ok.
+@item arm_fp16fml_neon_ok
+@anchor{arm_fp16fml_neon_ok}
+ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS}
+half-precision floating-point instructions available from ARMv8.2-A and
+onwards. Some multilibs may be incompatible with these options.
+
@item arm_prefer_ldrd_strd
ARM target prefers @code{LDRD} and @code{STRD} instructions over
@code{LDM} and @code{STM} instructions.
@@ -2384,6 +2390,11 @@ Add options for ARMv8.2-A with Adv.SIMD Dot Product support, if this is
supported by the target; see the
@ref{arm_v8_2a_dotprod_neon_ok} effective target keyword.
+@item arm_fp16fml_neon
+Add options to enable generation of the @code{VFMAL} and @code{VFMSL}
+instructions, if this is supported by the target; see the
+@ref{arm_fp16fml_neon_ok} effective target keyword.
+
@item bind_pic_locally
Add the target-specific flags needed to enable functions to bind
locally when using pic/PIC passes in the testsuite.
@@ -92,6 +92,14 @@ if {[multilib_config "aprofile"] } {
{-march=armv8.3-a+simd+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.3-a+simd+dotprod+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
{-march=armv8.3-a+simd+nofp+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.2-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+ {-march=armv8.2-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.2-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+ {-march=armv8.2-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.3-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+ {-march=armv8.3-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.3-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+ {-march=armv8.3-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.4-a+crypto -mfloat-abi=soft} "thumb/v8-a/nofp"
{-march=armv8.4-a+simd+crypto -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.4-a+simd+crypto+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlal_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlalq_high_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlsl_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlslq_high_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlal_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlalq_low_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlsl_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlslq_low_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
@@ -4442,6 +4442,48 @@ proc add_options_for_arm_v8_2a_dotprod_neon { flags } {
return "$flags $et_arm_v8_2a_dotprod_neon_flags"
}
+# Return 1 if the target supports FP16 VFMAL and VFMSL
+# instructions, 0 otherwise.
+# Record the command line options needed.
+
+proc check_effective_target_arm_fp16fml_neon_ok_nocache { } {
+ global et_arm_fp16fml_neon_flags
+ set et_arm_fp16fml_neon_flags ""
+
+ if { ![istarget arm*-*-*] } {
+ return 0;
+ }
+
+ # Iterate through sets of options to find the compiler flags that
+ # need to be added to the -march option.
+ foreach flags {"" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" "-mfloat-abi=hard -mfpu=neon-fp-armv8"} {
+ if { [check_no_compiler_messages_nocache \
+ arm_fp16fml_neon_ok object {
+ #if !defined (__ARM_FEATURE_FP16_FML)
+ #error "__ARM_FEATURE_FP16_FML not defined"
+ #endif
+ } "$flags -march=armv8.2-a+fp16fml"] } {
+ set et_arm_fp16fml_neon_flags "$flags -march=armv8.2-a+fp16fml"
+ return 1
+ }
+ }
+
+ return 0;
+}
+
+proc check_effective_target_arm_fp16fml_neon_ok { } {
+ return [check_cached_effective_target arm_fp16fml_neon_ok \
+ check_effective_target_arm_fp16fml_neon_ok_nocache]
+}
+
+proc add_options_for_arm_fp16fml_neon { flags } {
+ if { ! [check_effective_target_arm_fp16fml_neon_ok] } {
+ return "$flags"
+ }
+ global et_arm_fp16fml_neon_flags
+ return "$flags $et_arm_fp16fml_neon_flags"
+}
+
# Return 1 if the target supports executing ARMv8 NEON instructions, 0
# otherwise.