[ARM] Add support for fma intrinsics for Neon

Message ID	507FF551.7070800@arm.com
State	New
Headers	show Return-Path: <gcc-patches-return-329376-incoming=patchwork.ozlabs.org@gcc.gnu.org> Comment: DKIM? See http://www.dkim.org Comment: DomainKeys? See http://antispam.yahoo.com/domainkeys DomainKey-Signature: a=rsa-sha1; q=dns; c=nofws; s=default; d=gcc.gnu.org; h=Received:Received:X-SWARE-Spam-Status:X-Spam-Check-By:Received:Received:Received:Message-ID:Date:From:Reply-To:User-Agent:MIME-Version:To:Subject:References:In-Reply-To:X-MC-Unique:Content-Type:X-IsSubscribed:Mailing-List:Precedence:List-Id:List-Unsubscribe:List-Archive:List-Post:List-Help:Sender:Delivered-To; b=P1SohjVRYIJ5ZVtPMf5G01jS6T7ZJeUd9g856LhHuLy5qwETfSAXcEllJGAN49 XdlUaIQaHDqtJ4c8ITlQLxAHJqQHZY1LU5nTw20LUsx1kUa4wUXnMdCA5lSlpv3+ INyr0g/LTxR5i9jlwI5Z8EyVPBmz1nXogJyBCmM1hizhM=; Message-ID: <507FF551.7070800@arm.com> Date: Thu, 18 Oct 2012 13:25:53 +0100 From: Ramana Radhakrishnan <ramrad01@arm.com> Reply-To: ramrad01@arm.com User-Agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:14.0) Gecko/20120713 Thunderbird/14.0 MIME-Version: 1.0 To: gcc-patches@gcc.gnu.org Subject: Re: [Patch ARM] Add support for fma intrinsics for Neon References: <507FF08A.9010608@arm.com> In-Reply-To: <507FF08A.9010608@arm.com> Content-Type: multipart/mixed; boundary="------------090108060500070807000200" Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org

From ae8086e2d1ecae8fee711942d4b530947001e8ef Mon Sep 17 00:00:00 2001 From: Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org> Date: Wed, 17 Oct 2012 18:40:14 +0100 Subject: [PATCH 2/3] neon fma intrinsics. --- gcc/config/arm/arm.c | 2 + gcc/config/arm/arm_neon.h | 32 +++++ gcc/config/arm/neon-docgen.ml | 2 + gcc/config/arm/neon-gen.ml | 24 ++++- gcc/config/arm/neon-testgen.ml | 22 +++- gcc/config/arm/neon.md | 56 ++++++++ gcc/config/arm/neon.ml | 10 ++ gcc/doc/arm-neon-intrinsics.texi | 180 +++++++++++++++----------- gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c | 22 +++ gcc/testsuite/gcc.target/arm/neon/vfmaf32.c | 22 +++ gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c | 22 +++ gcc/testsuite/gcc.target/arm/neon/vfmsf32.c | 22 +++ 12 files changed, 337 insertions(+), 79 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmaf32.c create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmsf32.c diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 866385c..5fc7d67 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -18718,6 +18718,8 @@ static neon_builtin_datum neon_builtin_data[] = VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf), VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf), VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si), + VAR2 (TERNOP, vfma, v2sf, v4sf), + VAR2 (TERNOP, vfms, v2sf, v4sf), VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf), VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si), VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si), diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index b486d57..8fec83f 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -1350,6 +1350,38 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1); } +#ifdef __ARM_FEATURE_FMA +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c, 3); +} + +#endif +#ifdef __ARM_FEATURE_FMA +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c, 3); +} + +#endif +#ifdef __ARM_FEATURE_FMA +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c, 3); +} + +#endif +#ifdef __ARM_FEATURE_FMA +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c, 3); +} + +#endif __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vsub_s8 (int8x8_t __a, int8x8_t __b) { diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml index 23e37b4..043b1e0 100644 --- a/gcc/config/arm/neon-docgen.ml +++ b/gcc/config/arm/neon-docgen.ml @@ -103,6 +103,8 @@ let intrinsic_groups = "Multiplication", single_opcode Vmul; "Multiply-accumulate", single_opcode Vmla; "Multiply-subtract", single_opcode Vmls; + "Fused-multiply-accumulate", single_opcode Vfma; + "Fused-multiply-subtract", single_opcode Vfms; "Subtraction", single_opcode Vsub; "Comparison (equal-to)", single_opcode Vceq; "Comparison (greater-than-or-equal-to)", single_opcode Vcge; diff --git a/gcc/config/arm/neon-gen.ml b/gcc/config/arm/neon-gen.ml index 29679aa..6c4e272 100644 --- a/gcc/config/arm/neon-gen.ml +++ b/gcc/config/arm/neon-gen.ml @@ -286,6 +286,24 @@ let get_shuffle features = | _ -> None with Not_found -> None +let print_feature_test_start features = + try + match List.find (fun feature -> + match feature with Requires_feature _ -> true + | _ -> false) + features with + Requires_feature feature -> + Format.printf "#ifdef __ARM_FEATURE_%s@\n" feature + | _ -> assert false + with Not_found -> assert true + +let print_feature_test_end features = + let feature = + List.exists (function Requires_feature x -> true + | _ -> false) features in + if feature then Format.printf "#endif@\n" + + let print_variant opcode features shape name (ctype, asmtype, elttype) = let bits = infoword_value elttype features in let modesuf = mode_suffix elttype shape in @@ -302,7 +320,11 @@ let print_variant opcode features shape name (ctype, asmtype, elttype) = return ctype builtin in let body = pdecls @ rdecls @ stmts and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in - print_function ctype fnname body + begin + print_feature_test_start features; + print_function ctype fnname body; + print_feature_test_end features; + end (* When this function processes the element types in the ops table, it rewrites them in a list of tuples (a,b,c): diff --git a/gcc/config/arm/neon-testgen.ml b/gcc/config/arm/neon-testgen.ml index a69a539..4645f39 100644 --- a/gcc/config/arm/neon-testgen.ml +++ b/gcc/config/arm/neon-testgen.ml @@ -46,13 +46,14 @@ let open_test_file dir name = failwith ("Could not create test source file " ^ name ^ ": " ^ str) (* Emit prologue code to a test source file. *) -let emit_prologue chan test_name = +let emit_prologue chan test_name effective_target = Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name; Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n"; Printf.fprintf chan "/* { dg-do assemble } */\n"; - Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n"; + Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n" + effective_target; Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n"; - Printf.fprintf chan "/* { dg-add-options arm_neon } */\n"; + Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target; Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n"; Printf.fprintf chan "void test_%s (void)\n{\n" test_name @@ -156,6 +157,17 @@ let check_types tys = then (Const :: flags, String.sub ty 6 ((String.length ty) - 6)) else (flags, ty)) tys' +(* Work out what the effective target should be. *) +let effective_target features = + try + match List.find (fun feature -> + match feature with Requires_feature _ -> true + | _ -> false) + features with + Requires_feature "FMA" -> "arm_neonv2" + | _ -> assert false + with Not_found -> "arm_neon" + (* Given an intrinsic shape, produce a regexp that will match the right-hand sides of instructions generated by an intrinsic of that shape. *) @@ -263,8 +275,10 @@ let test_intrinsic dir opcode features shape name munge elt_ty = "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n") (analyze_all_shapes features shape analyze_shape) in + let effective_target = effective_target features + in (* Emit file and function prologues. *) - emit_prologue chan test_name; + emit_prologue chan test_name effective_target; (* Emit local variable declarations. *) emit_automatics chan c_types features; Printf.fprintf chan "\n"; diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index b89d538..92e03b0 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -722,6 +722,10 @@ ) ;; Fused multiply-accumulate +;; We define each insn twice here: +;; 1: with flag_unsafe_math_optimizations for the widening multiply phase +;; to be able to use when converting to FMA. +;; 2: without flag_unsafe_math_optimizations for the intrinsics to use. (define_insn "fma<VCVTF:mode>4" [(set (match_operand:VCVTF 0 "register_operand" "=w") (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w") @@ -735,6 +739,19 @@ (const_string "neon_fp_vmla_qqq")))] ) +(define_insn "fma<VCVTF:mode>4_intrinsic" + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w") + (match_operand:VCVTF 2 "register_operand" "w") + (match_operand:VCVTF 3 "register_operand" "0")))] + "TARGET_NEON && TARGET_FMA" + "vfma%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" + [(set (attr "neon_type") + (if_then_else (match_test "<Is_d_reg>") + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")))] +) + (define_insn "*fmsub<VCVTF:mode>4" [(set (match_operand:VCVTF 0 "register_operand" "=w") (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) @@ -748,6 +765,19 @@ (const_string "neon_fp_vmla_qqq")))] ) +(define_insn "fmsub<VCVTF:mode>4_intrinsic" + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) + (match_operand:VCVTF 2 "register_operand" "w") + (match_operand:VCVTF 3 "register_operand" "0")))] + "TARGET_NEON && TARGET_FMA" + "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2" + [(set (attr "neon_type") + (if_then_else (match_test "<Is_d_reg>") + (const_string "neon_fp_vmla_ddd") + (const_string "neon_fp_vmla_qqq")))] +) + (define_insn "ior<mode>3" [(set (match_operand:VDQ 0 "s_register_operand" "=w,w") (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0") @@ -1925,6 +1955,32 @@ DONE; }) +(define_expand "neon_vfma<VCVTF:mode>" + [(match_operand:VCVTF 0 "s_register_operand") + (match_operand:VCVTF 1 "s_register_operand") + (match_operand:VCVTF 2 "s_register_operand") + (match_operand:VCVTF 3 "s_register_operand") + (match_operand:SI 4 "immediate_operand")] + "TARGET_NEON && TARGET_FMA" +{ + emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + +(define_expand "neon_vfms<VCVTF:mode>" + [(match_operand:VCVTF 0 "s_register_operand") + (match_operand:VCVTF 1 "s_register_operand") + (match_operand:VCVTF 2 "s_register_operand") + (match_operand:VCVTF 3 "s_register_operand") + (match_operand:SI 4 "immediate_operand")] + "TARGET_NEON && TARGET_FMA" +{ + emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + ; Used for intrinsics when flag_unsafe_math_optimizations is false. (define_insn "neon_vmla<mode>_unspec" diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml index 56869c0..101f8f6 100644 --- a/gcc/config/arm/neon.ml +++ b/gcc/config/arm/neon.ml @@ -102,6 +102,8 @@ type opcode = | Vmul | Vmla | Vmls + | Vfma + | Vfms | Vsub | Vceq | Vcge @@ -275,6 +277,8 @@ type features = | Const_valuator of (int -> int) | Fixed_vector_reg | Fixed_core_reg + (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined. *) + | Requires_feature of string exception MixedMode of elts * elts @@ -802,6 +806,12 @@ let ops = Vmls, [], Long, "vmlsl", elts_same_io, su_8_32; Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32]; + (* Fused-multiply-accumulate. *) + Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32]; + Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32]; + Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32]; + Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32]; + (* Subtraction. *) Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32; Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64]; diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi index a75e582..14e6264 100644 --- a/gcc/doc/arm-neon-intrinsics.texi +++ b/gcc/doc/arm-neon-intrinsics.texi @@ -972,6 +972,38 @@ +@subsubsection Fused-multiply-accumulate + +@itemize @bullet +@item float32x2_t vfma_f32 (float32x2_t, float32x2_t, float32x2_t) +@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{d0}, @var{d0}, @var{d0}} +@end itemize + + +@itemize @bullet +@item float32x4_t vfmaq_f32 (float32x4_t, float32x4_t, float32x4_t) +@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{q0}, @var{q0}, @var{q0}} +@end itemize + + + + +@subsubsection Fused-multiply-subtract + +@itemize @bullet +@item float32x2_t vfms_f32 (float32x2_t, float32x2_t, float32x2_t) +@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{d0}, @var{d0}, @var{d0}} +@end itemize + + +@itemize @bullet +@item float32x4_t vfmsq_f32 (float32x4_t, float32x4_t, float32x4_t) +@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{q0}, @var{q0}, @var{q0}} +@end itemize + + + + @subsubsection Subtraction @itemize @bullet @@ -1497,24 +1529,6 @@ @subsubsection Comparison (greater-than-or-equal-to) @itemize @bullet -@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}} -@end itemize - - -@itemize @bullet -@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}} -@end itemize - - -@itemize @bullet -@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}} -@end itemize - - -@itemize @bullet @item uint32x2_t vcge_s32 (int32x2_t, int32x2_t) @*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}} @end itemize @@ -1539,20 +1553,20 @@ @itemize @bullet -@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}} +@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}} +@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}} +@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}} @end itemize @@ -1580,28 +1594,28 @@ @end itemize - - -@subsubsection Comparison (less-than-or-equal-to) - @itemize @bullet -@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}} +@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}} +@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}} +@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}} @end itemize + + +@subsubsection Comparison (less-than-or-equal-to) + @itemize @bullet @item uint32x2_t vcle_s32 (int32x2_t, int32x2_t) @*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}} @@ -1627,20 +1641,20 @@ @itemize @bullet -@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}} +@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}} +@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t) -@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}} +@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}} @end itemize @@ -1668,28 +1682,28 @@ @end itemize - - -@subsubsection Comparison (greater-than) - @itemize @bullet -@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}} +@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}} +@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}} +@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t) +@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}} @end itemize + + +@subsubsection Comparison (greater-than) + @itemize @bullet @item uint32x2_t vcgt_s32 (int32x2_t, int32x2_t) @*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}} @@ -1715,20 +1729,20 @@ @itemize @bullet -@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}} +@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}} +@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}} +@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}} @end itemize @@ -1756,28 +1770,28 @@ @end itemize - - -@subsubsection Comparison (less-than) - @itemize @bullet -@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}} +@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}} +@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}} @end itemize @itemize @bullet -@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}} +@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}} @end itemize + + +@subsubsection Comparison (less-than) + @itemize @bullet @item uint32x2_t vclt_s32 (int32x2_t, int32x2_t) @*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}} @@ -1803,20 +1817,20 @@ @itemize @bullet -@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}} +@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}} +@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}} @end itemize @itemize @bullet -@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t) -@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}} +@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}} @end itemize @@ -1844,6 +1858,24 @@ @end itemize +@itemize @bullet +@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}} +@end itemize + + +@itemize @bullet +@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}} +@end itemize + + +@itemize @bullet +@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t) +@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}} +@end itemize + + @subsubsection Comparison (absolute greater-than-or-equal-to) @@ -4810,13 +4842,13 @@ @itemize @bullet @item uint64_t vgetq_lane_u64 (uint64x2_t, const int) -@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} +@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}} @end itemize @itemize @bullet @item int64_t vgetq_lane_s64 (int64x2_t, const int) -@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} +@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}} @end itemize diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c new file mode 100644 index 0000000..d400163 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c @@ -0,0 +1,22 @@ +/* Test the `vfmaQf32' ARM Neon intrinsic. */ +/* This file was autogenerated by neon-testgen. */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_neonv2_ok } */ +/* { dg-options "-save-temps -O0" } */ +/* { dg-add-options arm_neonv2 } */ + +#include "arm_neon.h" + +void test_vfmaQf32 (void) +{ + float32x4_t out_float32x4_t; + float32x4_t arg0_float32x4_t; + float32x4_t arg1_float32x4_t; + float32x4_t arg2_float32x4_t; + + out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t); +} + +/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c new file mode 100644 index 0000000..988328d --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon/vfmaf32.c @@ -0,0 +1,22 @@ +/* Test the `vfmaf32' ARM Neon intrinsic. */ +/* This file was autogenerated by neon-testgen. */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_neonv2_ok } */ +/* { dg-options "-save-temps -O0" } */ +/* { dg-add-options arm_neonv2 } */ + +#include "arm_neon.h" + +void test_vfmaf32 (void) +{ + float32x2_t out_float32x2_t; + float32x2_t arg0_float32x2_t; + float32x2_t arg1_float32x2_t; + float32x2_t arg2_float32x2_t; + + out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t); +} + +/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c new file mode 100644 index 0000000..247a8ed --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c @@ -0,0 +1,22 @@ +/* Test the `vfmsQf32' ARM Neon intrinsic. */ +/* This file was autogenerated by neon-testgen. */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_neonv2_ok } */ +/* { dg-options "-save-temps -O0" } */ +/* { dg-add-options arm_neonv2 } */ + +#include "arm_neon.h" + +void test_vfmsQf32 (void) +{ + float32x4_t out_float32x4_t; + float32x4_t arg0_float32x4_t; + float32x4_t arg1_float32x4_t; + float32x4_t arg2_float32x4_t; + + out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t); +} + +/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c new file mode 100644 index 0000000..7f9e857 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon/vfmsf32.c @@ -0,0 +1,22 @@ +/* Test the `vfmsf32' ARM Neon intrinsic. */ +/* This file was autogenerated by neon-testgen. */ + +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_neonv2_ok } */ +/* { dg-options "-save-temps -O0" } */ +/* { dg-add-options arm_neonv2 } */ + +#include "arm_neon.h" + +void test_vfmsf32 (void) +{ + float32x2_t out_float32x2_t; + float32x2_t arg0_float32x2_t; + float32x2_t arg1_float32x2_t; + float32x2_t arg2_float32x2_t; + + out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t); +} + +/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */ +/* { dg-final { cleanup-saved-temps } } */ -- 1.7.4.1

[ARM] Add support for fma intrinsics for Neon

Commit Message

Patch