From ae8086e2d1ecae8fee711942d4b530947001e8ef Mon Sep 17 00:00:00 2001
From: Ramana Radhakrishnan <ramana.radhakrishnan@linaro.org>
Date: Wed, 17 Oct 2012 18:40:14 +0100
Subject: [PATCH 2/3] neon fma intrinsics.
---
gcc/config/arm/arm.c | 2 +
gcc/config/arm/arm_neon.h | 32 +++++
gcc/config/arm/neon-docgen.ml | 2 +
gcc/config/arm/neon-gen.ml | 24 ++++-
gcc/config/arm/neon-testgen.ml | 22 +++-
gcc/config/arm/neon.md | 56 ++++++++
gcc/config/arm/neon.ml | 10 ++
gcc/doc/arm-neon-intrinsics.texi | 180 +++++++++++++++-----------
gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c | 22 +++
gcc/testsuite/gcc.target/arm/neon/vfmaf32.c | 22 +++
gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c | 22 +++
gcc/testsuite/gcc.target/arm/neon/vfmsf32.c | 22 +++
12 files changed, 337 insertions(+), 79 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmaQf32.c
create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmaf32.c
create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmsQf32.c
create mode 100644 gcc/testsuite/gcc.target/arm/neon/vfmsf32.c
@@ -18718,6 +18718,8 @@ static neon_builtin_datum neon_builtin_data[] =
VAR8 (BINOP, vmul, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
VAR8 (TERNOP, vmla, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
VAR3 (TERNOP, vmlal, v8qi, v4hi, v2si),
+ VAR2 (TERNOP, vfma, v2sf, v4sf),
+ VAR2 (TERNOP, vfms, v2sf, v4sf),
VAR8 (TERNOP, vmls, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
VAR3 (TERNOP, vmlsl, v8qi, v4hi, v2si),
VAR4 (BINOP, vqdmulh, v4hi, v2si, v8hi, v4si),
@@ -1350,6 +1350,38 @@ vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
}
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return (float32x2_t)__builtin_neon_vfmav2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return (float32x4_t)__builtin_neon_vfmav4sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return (float32x2_t)__builtin_neon_vfmsv2sf (__a, __b, __c, 3);
+}
+
+#endif
+#ifdef __ARM_FEATURE_FMA
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return (float32x4_t)__builtin_neon_vfmsv4sf (__a, __b, __c, 3);
+}
+
+#endif
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vsub_s8 (int8x8_t __a, int8x8_t __b)
{
@@ -103,6 +103,8 @@ let intrinsic_groups =
"Multiplication", single_opcode Vmul;
"Multiply-accumulate", single_opcode Vmla;
"Multiply-subtract", single_opcode Vmls;
+ "Fused-multiply-accumulate", single_opcode Vfma;
+ "Fused-multiply-subtract", single_opcode Vfms;
"Subtraction", single_opcode Vsub;
"Comparison (equal-to)", single_opcode Vceq;
"Comparison (greater-than-or-equal-to)", single_opcode Vcge;
@@ -286,6 +286,24 @@ let get_shuffle features =
| _ -> None
with Not_found -> None
+let print_feature_test_start features =
+ try
+ match List.find (fun feature ->
+ match feature with Requires_feature _ -> true
+ | _ -> false)
+ features with
+ Requires_feature feature ->
+ Format.printf "#ifdef __ARM_FEATURE_%s@\n" feature
+ | _ -> assert false
+ with Not_found -> assert true
+
+let print_feature_test_end features =
+ let feature =
+ List.exists (function Requires_feature x -> true
+ | _ -> false) features in
+ if feature then Format.printf "#endif@\n"
+
+
let print_variant opcode features shape name (ctype, asmtype, elttype) =
let bits = infoword_value elttype features in
let modesuf = mode_suffix elttype shape in
@@ -302,7 +320,11 @@ let print_variant opcode features shape name (ctype, asmtype, elttype) =
return ctype builtin in
let body = pdecls @ rdecls @ stmts
and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
- print_function ctype fnname body
+ begin
+ print_feature_test_start features;
+ print_function ctype fnname body;
+ print_feature_test_end features;
+ end
(* When this function processes the element types in the ops table, it rewrites
them in a list of tuples (a,b,c):
@@ -46,13 +46,14 @@ let open_test_file dir name =
failwith ("Could not create test source file " ^ name ^ ": " ^ str)
(* Emit prologue code to a test source file. *)
-let emit_prologue chan test_name =
+let emit_prologue chan test_name effective_target =
Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name;
Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n";
Printf.fprintf chan "/* { dg-do assemble } */\n";
- Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+ Printf.fprintf chan "/* { dg-require-effective-target %s_ok } */\n"
+ effective_target;
Printf.fprintf chan "/* { dg-options \"-save-temps -O0\" } */\n";
- Printf.fprintf chan "/* { dg-add-options arm_neon } */\n";
+ Printf.fprintf chan "/* { dg-add-options %s } */\n" effective_target;
Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
Printf.fprintf chan "void test_%s (void)\n{\n" test_name
@@ -156,6 +157,17 @@ let check_types tys =
then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
else (flags, ty)) tys'
+(* Work out what the effective target should be. *)
+let effective_target features =
+ try
+ match List.find (fun feature ->
+ match feature with Requires_feature _ -> true
+ | _ -> false)
+ features with
+ Requires_feature "FMA" -> "arm_neonv2"
+ | _ -> assert false
+ with Not_found -> "arm_neon"
+
(* Given an intrinsic shape, produce a regexp that will match
the right-hand sides of instructions generated by an intrinsic of
that shape. *)
@@ -263,8 +275,10 @@ let test_intrinsic dir opcode features shape name munge elt_ty =
"!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
(analyze_all_shapes features shape analyze_shape)
in
+ let effective_target = effective_target features
+ in
(* Emit file and function prologues. *)
- emit_prologue chan test_name;
+ emit_prologue chan test_name effective_target;
(* Emit local variable declarations. *)
emit_automatics chan c_types features;
Printf.fprintf chan "\n";
@@ -722,6 +722,10 @@
)
;; Fused multiply-accumulate
+;; We define each insn twice here:
+;; 1: with flag_unsafe_math_optimizations for the widening multiply phase
+;; to be able to use when converting to FMA.
+;; 2: without flag_unsafe_math_optimizations for the intrinsics to use.
(define_insn "fma<VCVTF:mode>4"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
@@ -735,6 +739,19 @@
(const_string "neon_fp_vmla_qqq")))]
)
+(define_insn "fma<VCVTF:mode>4_intrinsic"
+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
+ (fma:VCVTF (match_operand:VCVTF 1 "register_operand" "w")
+ (match_operand:VCVTF 2 "register_operand" "w")
+ (match_operand:VCVTF 3 "register_operand" "0")))]
+ "TARGET_NEON && TARGET_FMA"
+ "vfma%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (match_test "<Is_d_reg>")
+ (const_string "neon_fp_vmla_ddd")
+ (const_string "neon_fp_vmla_qqq")))]
+)
+
(define_insn "*fmsub<VCVTF:mode>4"
[(set (match_operand:VCVTF 0 "register_operand" "=w")
(fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
@@ -748,6 +765,19 @@
(const_string "neon_fp_vmla_qqq")))]
)
+(define_insn "fmsub<VCVTF:mode>4_intrinsic"
+ [(set (match_operand:VCVTF 0 "register_operand" "=w")
+ (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w"))
+ (match_operand:VCVTF 2 "register_operand" "w")
+ (match_operand:VCVTF 3 "register_operand" "0")))]
+ "TARGET_NEON && TARGET_FMA"
+ "vfms%?.<V_if_elem>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (match_test "<Is_d_reg>")
+ (const_string "neon_fp_vmla_ddd")
+ (const_string "neon_fp_vmla_qqq")))]
+)
+
(define_insn "ior<mode>3"
[(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
(ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
@@ -1925,6 +1955,32 @@
DONE;
})
+(define_expand "neon_vfma<VCVTF:mode>"
+ [(match_operand:VCVTF 0 "s_register_operand")
+ (match_operand:VCVTF 1 "s_register_operand")
+ (match_operand:VCVTF 2 "s_register_operand")
+ (match_operand:VCVTF 3 "s_register_operand")
+ (match_operand:SI 4 "immediate_operand")]
+ "TARGET_NEON && TARGET_FMA"
+{
+ emit_insn (gen_fma<mode>4_intrinsic (operands[0], operands[2], operands[3],
+ operands[1]));
+ DONE;
+})
+
+(define_expand "neon_vfms<VCVTF:mode>"
+ [(match_operand:VCVTF 0 "s_register_operand")
+ (match_operand:VCVTF 1 "s_register_operand")
+ (match_operand:VCVTF 2 "s_register_operand")
+ (match_operand:VCVTF 3 "s_register_operand")
+ (match_operand:SI 4 "immediate_operand")]
+ "TARGET_NEON && TARGET_FMA"
+{
+ emit_insn (gen_fmsub<mode>4_intrinsic (operands[0], operands[2], operands[3],
+ operands[1]));
+ DONE;
+})
+
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
@@ -102,6 +102,8 @@ type opcode =
| Vmul
| Vmla
| Vmls
+ | Vfma
+ | Vfms
| Vsub
| Vceq
| Vcge
@@ -275,6 +277,8 @@ type features =
| Const_valuator of (int -> int)
| Fixed_vector_reg
| Fixed_core_reg
+ (* Mark that the intrinsic requires __ARM_FEATURE_string to be defined. *)
+ | Requires_feature of string
exception MixedMode of elts * elts
@@ -802,6 +806,12 @@ let ops =
Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
+ (* Fused-multiply-accumulate. *)
+ Vfma, [Requires_feature "FMA"], All (3, Dreg), "vfma", elts_same_io, [F32];
+ Vfma, [Requires_feature "FMA"], All (3, Qreg), "vfmaQ", elts_same_io, [F32];
+ Vfms, [Requires_feature "FMA"], All (3, Dreg), "vfms", elts_same_io, [F32];
+ Vfms, [Requires_feature "FMA"], All (3, Qreg), "vfmsQ", elts_same_io, [F32];
+
(* Subtraction. *)
Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_32;
Vsub, [No_op], All (3, Dreg), "vsub", sign_invar_2, [S64; U64];
@@ -972,6 +972,38 @@
+@subsubsection Fused-multiply-accumulate
+
+@itemize @bullet
+@item float32x2_t vfma_f32 (float32x2_t, float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{d0}, @var{d0}, @var{d0}}
+@end itemize
+
+
+@itemize @bullet
+@item float32x4_t vfmaq_f32 (float32x4_t, float32x4_t, float32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vfma.f32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+
+
+@subsubsection Fused-multiply-subtract
+
+@itemize @bullet
+@item float32x2_t vfms_f32 (float32x2_t, float32x2_t, float32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{d0}, @var{d0}, @var{d0}}
+@end itemize
+
+
+@itemize @bullet
+@item float32x4_t vfmsq_f32 (float32x4_t, float32x4_t, float32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vfms.f32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+
+
@subsubsection Subtraction
@itemize @bullet
@@ -1497,24 +1529,6 @@
@subsubsection Comparison (greater-than-or-equal-to)
@itemize @bullet
-@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
-@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
-@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
-@end itemize
-
-
-@itemize @bullet
@item uint32x2_t vcge_s32 (int32x2_t, int32x2_t)
@*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@@ -1539,20 +1553,20 @@
@itemize @bullet
-@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcge_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcge_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcge_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@@ -1580,28 +1594,28 @@
@end itemize
-
-
-@subsubsection Comparison (less-than-or-equal-to)
-
@itemize @bullet
-@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcgeq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcgeq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcgeq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
@end itemize
+
+
+@subsubsection Comparison (less-than-or-equal-to)
+
@itemize @bullet
@item uint32x2_t vcle_s32 (int32x2_t, int32x2_t)
@*@emph{Form of expected instruction(s):} @code{vcge.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1627,20 +1641,20 @@
@itemize @bullet
-@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcle_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcle_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcle_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@@ -1668,28 +1682,28 @@
@end itemize
-
-
-@subsubsection Comparison (greater-than)
-
@itemize @bullet
-@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcleq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u32 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcleq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u16 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcleq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcge.u8 @var{q0}, @var{q0}, @var{q0}}
@end itemize
+
+
+@subsubsection Comparison (greater-than)
+
@itemize @bullet
@item uint32x2_t vcgt_s32 (int32x2_t, int32x2_t)
@*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1715,20 +1729,20 @@
@itemize @bullet
-@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vcgt_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vcgt_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vcgt_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@@ -1756,28 +1770,28 @@
@end itemize
-
-
-@subsubsection Comparison (less-than)
-
@itemize @bullet
-@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
+@item uint32x4_t vcgtq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
+@item uint16x8_t vcgtq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
@end itemize
@itemize @bullet
-@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
+@item uint8x16_t vcgtq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
@end itemize
+
+
+@subsubsection Comparison (less-than)
+
@itemize @bullet
@item uint32x2_t vclt_s32 (int32x2_t, int32x2_t)
@*@emph{Form of expected instruction(s):} @code{vcgt.s32 @var{d0}, @var{d0}, @var{d0}}
@@ -1803,20 +1817,20 @@
@itemize @bullet
-@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@item uint32x2_t vclt_u32 (uint32x2_t, uint32x2_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@item uint16x4_t vclt_u16 (uint16x4_t, uint16x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@itemize @bullet
-@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t)
-@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@item uint8x8_t vclt_u8 (uint8x8_t, uint8x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{d0}, @var{d0}, @var{d0}}
@end itemize
@@ -1844,6 +1858,24 @@
@end itemize
+@itemize @bullet
+@item uint32x4_t vcltq_u32 (uint32x4_t, uint32x4_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u32 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+@itemize @bullet
+@item uint16x8_t vcltq_u16 (uint16x8_t, uint16x8_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u16 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
+@itemize @bullet
+@item uint8x16_t vcltq_u8 (uint8x16_t, uint8x16_t)
+@*@emph{Form of expected instruction(s):} @code{vcgt.u8 @var{q0}, @var{q0}, @var{q0}}
+@end itemize
+
+
@subsubsection Comparison (absolute greater-than-or-equal-to)
@@ -4810,13 +4842,13 @@
@itemize @bullet
@item uint64_t vgetq_lane_u64 (uint64x2_t, const int)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}}
+@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}}
@end itemize
@itemize @bullet
@item int64_t vgetq_lane_s64 (int64x2_t, const int)
-@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}}
+@*@emph{Form of expected instruction(s):} @code{vmov @var{r0}, @var{r0}, @var{d0}} @emph{or} @code{fmrrd @var{r0}, @var{r0}, @var{d0}}
@end itemize
new file mode 100644
@@ -0,0 +1,22 @@
+/* Test the `vfmaQf32' ARM Neon intrinsic. */
+/* This file was autogenerated by neon-testgen. */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaQf32 (void)
+{
+ float32x4_t out_float32x4_t;
+ float32x4_t arg0_float32x4_t;
+ float32x4_t arg1_float32x4_t;
+ float32x4_t arg2_float32x4_t;
+
+ out_float32x4_t = vfmaq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,22 @@
+/* Test the `vfmaf32' ARM Neon intrinsic. */
+/* This file was autogenerated by neon-testgen. */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmaf32 (void)
+{
+ float32x2_t out_float32x2_t;
+ float32x2_t arg0_float32x2_t;
+ float32x2_t arg1_float32x2_t;
+ float32x2_t arg2_float32x2_t;
+
+ out_float32x2_t = vfma_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfma\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,22 @@
+/* Test the `vfmsQf32' ARM Neon intrinsic. */
+/* This file was autogenerated by neon-testgen. */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsQf32 (void)
+{
+ float32x4_t out_float32x4_t;
+ float32x4_t arg0_float32x4_t;
+ float32x4_t arg1_float32x4_t;
+ float32x4_t arg2_float32x4_t;
+
+ out_float32x4_t = vfmsq_f32 (arg0_float32x4_t, arg1_float32x4_t, arg2_float32x4_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
new file mode 100644
@@ -0,0 +1,22 @@
+/* Test the `vfmsf32' ARM Neon intrinsic. */
+/* This file was autogenerated by neon-testgen. */
+
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_neonv2_ok } */
+/* { dg-options "-save-temps -O0" } */
+/* { dg-add-options arm_neonv2 } */
+
+#include "arm_neon.h"
+
+void test_vfmsf32 (void)
+{
+ float32x2_t out_float32x2_t;
+ float32x2_t arg0_float32x2_t;
+ float32x2_t arg1_float32x2_t;
+ float32x2_t arg2_float32x2_t;
+
+ out_float32x2_t = vfms_f32 (arg0_float32x2_t, arg1_float32x2_t, arg2_float32x2_t);
+}
+
+/* { dg-final { scan-assembler "vfms\.f32\[ \]+\[dD\]\[0-9\]+, \[dD\]\[0-9\]+, \[dD\]\[0-9\]+!?\(\[ \]+@\[a-zA-Z0-9 \]+\)?\n" } } */
+/* { dg-final { cleanup-saved-temps } } */
--
1.7.4.1