[arm,2/3] Implement fp16fml extension for ARMv8.4-A

Message ID	5A53AFDA.3020507@foss.arm.com
State	New
Headers	show Return-Path: <gcc-patches-return-470448-incoming=patchwork.ozlabs.org@gcc.gnu.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:cc:subject:content-type; q=dns; s=default; b=YB8fNIw1lXT9uHMHEbsLS/31u9zjhNFrhCpO2VUkOL6 T/SyqU9QEEqGFJaDRnD5VKBDR0HVuXHRw9rSKwY4mssUm4BVXf9p/yFrJEklFgVq e7eBs/4/WQTtjvh7wX469Lt3QlKUgiR6bpx6PSwCb4mNzHw49NLPTRm9cxxXQVnY = Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org Message-ID: <5A53AFDA.3020507@foss.arm.com> Date: Mon, 08 Jan 2018 17:52:26 +0000 From: Kyrill Tkachov <kyrylo.tkachov@foss.arm.com> User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:31.0) Gecko/20100101 Thunderbird/31.2.0 MIME-Version: 1.0 To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org> CC: Christophe Lyon <christophe.lyon@linaro.org> Subject: [PATCH][arm][2/3] Implement fp16fml extension for ARMv8.4-A Content-Type: multipart/mixed; boundary="------------030606040705040909030809"
Series	[arm,1/3] Add -march=armv8.4-a option \| expand [arm,2/3] Implement fp16fml extension for ARMv8.4-A [arm,3/3] Implement fp16fml lane intrinsics [arm,1/3] Add -march=armv8.4-a option

diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c index 635bc3c1c38de79802041fc50229b90defd2e467..46dc8d51ffcd80983a70f1bd283caa3688648c9b 100644 --- a/gcc/config/arm/arm-c.c +++ b/gcc/config/arm/arm-c.c @@ -160,6 +160,7 @@ arm_cpu_builtins (struct cpp_reader* pfile) TARGET_VFP_FP16INST); def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", TARGET_NEON_FP16INST); + def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_FML", TARGET_FP16FML); def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA); def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON); diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in index 0967b9d2277a0d211452b7cd4d579db1774f29b3..7b9224b6b0791a9a7a315e1807b439604a3c0929 100644 --- a/gcc/config/arm/arm-cpus.in +++ b/gcc/config/arm/arm-cpus.in @@ -165,6 +165,9 @@ define feature fp16 # Dot Product instructions extension to ARMv8.2-a. define feature dotprod +# Half-precision floating-point instructions in ARMv8.4-A. +define feature fp16fml + # ISA Quirks (errata?). Don't forget to add this to the fgroup # ALL_QUIRKS below. @@ -202,7 +205,7 @@ define fgroup ALL_CRYPTO crypto # strip off 32 D-registers, but does not remove support for # double-precision FP. define fgroup ALL_SIMD_INTERNAL fp_d32 neon ALL_CRYPTO -define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod +define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod fp16fml # List of all FPU bits to strip out if -mfpu is used to override the # default. fp16 is deliberately missing from this list. @@ -581,6 +584,7 @@ begin arch armv8.2-a isa ARMv8_2a option simd add FP_ARMv8 NEON option fp16 add fp16 FP_ARMv8 NEON + option fp16fml add fp16fml fp16 FP_ARMv8 NEON option crypto add FP_ARMv8 CRYPTO option nocrypto remove ALL_CRYPTO option nofp remove ALL_FP @@ -595,6 +599,7 @@ begin arch armv8.3-a isa ARMv8_3a option simd add FP_ARMv8 NEON option fp16 add fp16 FP_ARMv8 NEON + option fp16fml add fp16fml fp16 FP_ARMv8 NEON option crypto add FP_ARMv8 CRYPTO option nocrypto remove ALL_CRYPTO option nofp remove ALL_FP @@ -608,7 +613,7 @@ begin arch armv8.4-a profile A isa ARMv8_4a option simd add FP_ARMv8 DOTPROD - option fp16 add fp16 FP_ARMv8 DOTPROD + option fp16 add fp16 fp16fml FP_ARMv8 DOTPROD option crypto add FP_ARMv8 CRYPTO DOTPROD option nocrypto remove ALL_CRYPTO option nofp remove ALL_FP diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def index f58a159a89852a82587ad974387468dab6c9be80..0fb22111cbb35fadd6642a5042779b7586870ceb 100644 --- a/gcc/config/arm/arm-modes.def +++ b/gcc/config/arm/arm-modes.def @@ -67,6 +67,7 @@ VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ +VECTOR_MODE (FLOAT, HF, 2); /* V2HF */ /* Fraction and accumulator vector modes. */ VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */ diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h index 410bfb998419dd3b10d47cc143def5cfdc1b02a0..fbb5e9f38af50dbefb465b6ed370c050ae2f6274 100644 --- a/gcc/config/arm/arm.h +++ b/gcc/config/arm/arm.h @@ -216,10 +216,18 @@ extern tree arm_fp16_type_node; isa_bit_dotprod) \ && arm_arch8_2) -/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */ +/* FPU supports the floating point FP16 instructions for ARMv8.2-A + and later. */ #define TARGET_VFP_FP16INST \ (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst) +/* Target supports the floating point FP16 instructions from ARMv8.2-A + and later. */ +#define TARGET_FP16FML (TARGET_NEON \ + && bitmap_bit_p (arm_active_target.isa, \ + isa_bit_fp16fml) \ + && arm_arch8_2) + /* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later. */ #define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA) diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 649182b6a776220cf3aba1b5b1b023e4ccf7857f..01324096d673187b504b89a6d68785275b445b1b 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -18104,6 +18104,69 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) #pragma GCC pop_options #endif +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+fp16fml") + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vfmal_lowv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vfmsl_lowv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlal_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vfmal_highv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlsl_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vfmsl_highv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlalq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vfmal_lowv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlslq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vfmsl_lowv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlalq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vfmal_highv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b) +{ + return __builtin_neon_vfmsl_highv4sf (__r, __a, __b); +} + +#pragma GCC pop_options +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 982eec810dafb5ec955273099853f8842020d104..d4fe33b0502f46b9d6303a08003ede2c69574e29 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -51,6 +51,10 @@ VAR2 (TERNOP, vqdmlal, v4hi, v2si) VAR2 (TERNOP, vqdmlsl, v4hi, v2si) VAR4 (TERNOP, vqrdmlah, v4hi, v2si, v8hi, v4si) VAR4 (TERNOP, vqrdmlsh, v4hi, v2si, v8hi, v4si) +VAR2 (TERNOP, vfmal_low, v2sf, v4sf) +VAR2 (TERNOP, vfmal_high, v2sf, v4sf) +VAR2 (TERNOP, vfmsl_low, v2sf, v4sf) +VAR2 (TERNOP, vfmsl_high, v2sf, v4sf) VAR3 (BINOP, vmullp, v8qi, v4hi, v2si) VAR3 (BINOP, vmulls, v8qi, v4hi, v2si) VAR3 (BINOP, vmullu, v8qi, v4hi, v2si) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index a4fb234a846795e1c0dd5bf7de76ff7da487be23..efa410e4fbd301e0e43d5364bb3bd59e676962bf 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -247,6 +247,9 @@ (define_code_iterator SHIFTABLE_OPS [plus minus ior xor and]) ;; Operations on the sign of a number. (define_code_iterator ABSNEG [abs neg]) +;; The PLUS and MINUS operators. +(define_code_iterator PLUSMINUS [plus minus]) + ;; Conversions. (define_code_iterator FCVT [unsigned_float float]) @@ -266,6 +269,8 @@ (define_code_attr cmp_op [(eq "eq") (gt "gt") (ge "ge") (lt "lt") (le "le") (define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")]) +(define_code_attr vfml_op [(plus "a") (minus "s")]) + ;;---------------------------------------------------------------------------- ;; Int iterators ;;---------------------------------------------------------------------------- @@ -412,6 +417,8 @@ (define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE]) (define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U]) +(define_int_iterator VFMLHALVES [UNSPEC_VFML_LO UNSPEC_VFML_HI]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -471,6 +478,12 @@ (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) +;; Mode mapping for VFM[A,S]L instructions. +(define_mode_attr VFML [(V2SF "V4HF") (V4SF "V8HF")]) + +;; Mode mapping for VFM[A,S]L instructions for the vec_select result. +(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")]) + ;; Similar, for three elements. (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") @@ -494,8 +507,14 @@ (define_mode_attr V_reg [(V8QI "P") (V16QI "q") (V2SI "P") (V4SI "q") (V2SF "P") (V4SF "q") (DI "P") (V2DI "q") - (SF "") (DF "P") - (HF "")]) + (V2HF "") (SF "") + (DF "P") (HF "")]) + +;; Output template to select the high VFP register of a mult-register value. +(define_mode_attr V_hi [(V2SF "p") (V4SF "f")]) + +;; Output template to select the low VFP register of a mult-register value. +(define_mode_attr V_lo [(V2SF "") (V4SF "e")]) ;; Wider modes with the same number of elements. (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")]) @@ -708,6 +727,7 @@ (define_mode_attr V_innermode [(V8QI "QI") (V4HI "HI") (V2SI "SI")]) (define_mode_attr F_constraint [(SF "t") (DF "w")]) (define_mode_attr vfp_type [(SF "s") (DF "d")]) (define_mode_attr vfp_double_cond [(SF "") (DF "&& TARGET_VFP_DOUBLE")]) +(define_mode_attr VF_constraint [(V2SF "t") (V4SF "w")]) ;; Mode attribute used to build the "type" attribute. (define_mode_attr q [(V8QI "") (V16QI "_q") @@ -824,6 +844,12 @@ (define_int_attr sup [ (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u") ]) +(define_int_attr vfml_half + [(UNSPEC_VFML_HI "high") (UNSPEC_VFML_LO "low")]) + +(define_int_attr vfml_half_selector + [(UNSPEC_VFML_HI "true") (UNSPEC_VFML_LO "false")]) + (define_int_attr vcvth_op [(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a") (UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m") diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 073c26580dd317a01cd0e275965fee2ef83ae3f9..75be5aca8b57ad744d70253989073756bf4ca1fe 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -2290,6 +2290,98 @@ (define_expand "neon_vfms<VH:mode>" DONE; }) +;; The expand RTL structure here is not important. +;; We use the gen_* functions anyway. +;; We just need something to wrap the iterators around. + +(define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>" + [(set (match_operand:VCVTF 0 "s_register_operand") + (unspec:VCVTF + [(match_operand:VCVTF 1 "s_register_operand") + (PLUSMINUS:<VFML> + (match_operand:<VFML> 2 "s_register_operand") + (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))] + "TARGET_FP16FML" +{ + rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>); + emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0], + operands[1], + operands[2], + operands[3], + half, half)); + DONE; +}) + +(define_insn "vfmal_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" ""))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_low" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" "")))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_high" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmal_high<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_high" ""))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_high" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + +(define_insn "vfmsl_low<mode>_intrinsic" + [(set (match_operand:VCVTF 0 "s_register_operand" "=w") + (fma:VCVTF + (float_extend:VCVTF + (neg:<VFMLSEL> + (vec_select:<VFMLSEL> + (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 4 "vect_par_constant_low" "")))) + (float_extend:VCVTF + (vec_select:<VFMLSEL> + (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>") + (match_operand:<VFML> 5 "vect_par_constant_low" ""))) + (match_operand:VCVTF 1 "s_register_operand" "0")))] + "TARGET_FP16FML" + "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3" + [(set_attr "type" "neon_fp_mla_s<q>")] +) + ; Used for intrinsics when flag_unsafe_math_optimizations is false. (define_insn "neon_vmla<mode>_unspec" diff --git a/gcc/config/arm/t-arm-elf b/gcc/config/arm/t-arm-elf index 3e721ec789806335c6097d4088642150abf1003a..d9fc9f1a5615ffb036dde18cd8e34c22b5c874cd 100644 --- a/gcc/config/arm/t-arm-elf +++ b/gcc/config/arm/t-arm-elf @@ -36,7 +36,7 @@ v7ve_fps := vfpv3-d16 vfpv3 vfpv3-d16-fp16 vfpv3-fp16 vfpv4 neon \ # Not all these permutations exist for all architecture variants, but # it seems to work ok. -v8_fps := simd fp16 crypto fp16+crypto dotprod +v8_fps := simd fp16 crypto fp16+crypto dotprod fp16fml # We don't do anything special with these. Pre-v4t probably doesn't work. all_early_nofp := armv2 armv2a armv3 armv3m armv4 armv4t armv5 armv5t diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib index 26b8ae15da74b275e5617b3054572d2a7e8cfe49..09c451e8ca6ee128989a331ef5870031c56b3e3b 100644 --- a/gcc/config/arm/t-multilib +++ b/gcc/config/arm/t-multilib @@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd v8_a_nosimd_variants := +crc v8_a_simd_variants := $(call all_feat_combs, simd crypto) v8_1_a_simd_variants := $(call all_feat_combs, simd crypto) -v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto dotprod) +v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 fp16fml crypto dotprod) v8_4_a_simd_variants := $(call all_feat_combs, simd fp16 crypto) ifneq (,$(HAS_APROFILE)) diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index c474f4bb5db995b60f464f098e478f0398ce15f9..cc2309e751676ea7a15e6c4709e9bc55a32b51b3 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -412,4 +412,6 @@ (define_c_enum "unspec" [ UNSPEC_VRNDX UNSPEC_DOT_S UNSPEC_DOT_U + UNSPEC_VFML_LO + UNSPEC_VFML_HI ]) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 9c2388aae2b813c675bf4b697cfd80e79cbfdb78..0de15e70d86b2a9752911d1e3c2b62aea414d7ad 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -15860,6 +15860,11 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions. The half-precision floating-point data processing instructions. This also enables the Advanced SIMD and floating-point instructions. +@item +fp16fml +The half-precision floating-point fmla extension. This also enables +the half-precision floating-point extension and Advanced SIMD and +floating-point instructions. + @item +simd The ARMv8.1-A Advanced SIMD and floating-point instructions. @@ -15882,7 +15887,8 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions. @item +fp16 The half-precision floating-point data processing instructions. This also enables the Advanced SIMD and floating-point instructions as well -as the Dot Product extension. +as the Dot Product extension and the half-precision floating-point fmla +extension. @item +simd The ARMv8.3-A Advanced SIMD and floating-point instructions as well as the diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index d5da39334e4a4d1aae9a6222883a413d1a315020..2c9438c3491f71abed5deb71f70ba5f6aae918ac 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -1769,6 +1769,12 @@ ARM target supports executing instructions from ARMv8.2-A with the Dot Product extension. Some multilibs may be incompatible with these options. Implies arm_v8_2a_dotprod_neon_ok. +@item arm_fp16fml_neon_ok +@anchor{arm_fp16fml_neon_ok} +ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS} +half-precision floating-point instructions available from ARMv8.2-A and +onwards. Some multilibs may be incompatible with these options. + @item arm_prefer_ldrd_strd ARM target prefers @code{LDRD} and @code{STRD} instructions over @code{LDM} and @code{STM} instructions. @@ -2384,6 +2390,11 @@ Add options for ARMv8.2-A with Adv.SIMD Dot Product support, if this is supported by the target; see the @ref{arm_v8_2a_dotprod_neon_ok} effective target keyword. +@item arm_fp16fml_neon +Add options to enable generation of the @code{VFMAL} and @code{VFMSL} +instructions, if this is supported by the target; see the +@ref{arm_fp16fml_neon_ok} effective target keyword. + @item bind_pic_locally Add the target-specific flags needed to enable functions to bind locally when using pic/PIC passes in the testsuite. diff --git a/gcc/testsuite/gcc.target/arm/multilib.exp b/gcc/testsuite/gcc.target/arm/multilib.exp index b210f32f680a673bedd3dc16ae74fefe70a403e4..4e3324f23114c751b99d41d85dd14e0cb5d79145 100644 --- a/gcc/testsuite/gcc.target/arm/multilib.exp +++ b/gcc/testsuite/gcc.target/arm/multilib.exp @@ -92,6 +92,14 @@ if {[multilib_config "aprofile"] } { {-march=armv8.3-a+simd+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" {-march=armv8.3-a+simd+dotprod+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp" {-march=armv8.3-a+simd+nofp+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" + {-march=armv8.2-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp" + {-march=armv8.2-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" + {-march=armv8.2-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp" + {-march=armv8.2-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" + {-march=armv8.3-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp" + {-march=armv8.3-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" + {-march=armv8.3-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp" + {-march=armv8.3-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" {-march=armv8.4-a+crypto -mfloat-abi=soft} "thumb/v8-a/nofp" {-march=armv8.4-a+simd+crypto -mfloat-abi=softfp} "thumb/v8-a+simd/softfp" {-march=armv8.4-a+simd+crypto+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp" diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c new file mode 100644 index 0000000000000000000000000000000000000000..0f50a57f42836dfd93d9dd2b52001fc6d6356744 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16fml_neon_ok } */ +/* { dg-add-options arm_fp16fml_neon } */ + +#include "arm_neon.h" + +float32x2_t +test_vfmlal_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + return vfmlal_high_u32 (r, a, b); +} + +float32x4_t +test_vfmlalq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + return vfmlalq_high_u32 (r, a, b); +} + +float32x2_t +test_vfmlsl_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + return vfmlsl_high_u32 (r, a, b); +} + +float32x4_t +test_vfmlslq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + return vfmlslq_high_u32 (r, a, b); +} + +/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c new file mode 100644 index 0000000000000000000000000000000000000000..427331c8684ca5f0cc47272e4c30e23908995f33 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_fp16fml_neon_ok } */ +/* { dg-add-options arm_fp16fml_neon } */ + +#include "arm_neon.h" + +float32x2_t +test_vfmlal_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + return vfmlal_low_u32 (r, a, b); +} + +float32x4_t +test_vfmlalq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + return vfmlalq_low_u32 (r, a, b); +} + +float32x2_t +test_vfmlsl_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b) +{ + return vfmlsl_low_u32 (r, a, b); +} + +float32x4_t +test_vfmlslq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b) +{ + return vfmlslq_low_u32 (r, a, b); +} + +/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */ +/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */ diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 114c1f11ccc00fb50a595e613122429e09c5925f..14517a840ea8870c8fab496637fa03c9c01634b9 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -4442,6 +4442,48 @@ proc add_options_for_arm_v8_2a_dotprod_neon { flags } { return "$flags $et_arm_v8_2a_dotprod_neon_flags" } +# Return 1 if the target supports FP16 VFMAL and VFMSL +# instructions, 0 otherwise. +# Record the command line options needed. + +proc check_effective_target_arm_fp16fml_neon_ok_nocache { } { + global et_arm_fp16fml_neon_flags + set et_arm_fp16fml_neon_flags "" + + if { ![istarget arm*-*-*] } { + return 0; + } + + # Iterate through sets of options to find the compiler flags that + # need to be added to the -march option. + foreach flags {"" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" "-mfloat-abi=hard -mfpu=neon-fp-armv8"} { + if { [check_no_compiler_messages_nocache \ + arm_fp16fml_neon_ok object { + #if !defined (__ARM_FEATURE_FP16_FML) + #error "__ARM_FEATURE_FP16_FML not defined" + #endif + } "$flags -march=armv8.2-a+fp16fml"] } { + set et_arm_fp16fml_neon_flags "$flags -march=armv8.2-a+fp16fml" + return 1 + } + } + + return 0; +} + +proc check_effective_target_arm_fp16fml_neon_ok { } { + return [check_cached_effective_target arm_fp16fml_neon_ok \ + check_effective_target_arm_fp16fml_neon_ok_nocache] +} + +proc add_options_for_arm_fp16fml_neon { flags } { + if { ! [check_effective_target_arm_fp16fml_neon_ok] } { + return "$flags" + } + global et_arm_fp16fml_neon_flags + return "$flags $et_arm_fp16fml_neon_flags" +} + # Return 1 if the target supports executing ARMv8 NEON instructions, 0 # otherwise.

[arm,2/3] Implement fp16fml extension for ARMv8.4-A

Commit Message

Patch