===================================================================
@@ -3948,6 +3948,16 @@ means of constraints requiring operands
@itemx @samp{and@var{m}3}, @samp{ior@var{m}3}, @samp{xor@var{m}3}
Similar, for other arithmetic operations.
+@cindex @code{fma@var{m}4} instruction pattern
+@item @samp{fma@var{m}4}
+Multiply operand 2 and operand 1, then add operand 3, storing the
+result in operand 0. All operands must have mode @var{m}. This
+pattern is used to implement the @code{fma}, @code{fmas}, and
+@code{fmal} builtin functions from the ISO C99 standard. The
+@code{fma} operation may produce different results than doing the
+multiply followed by the add if the machine does not perform a
+rounding step between the operations.
+
@cindex @code{min@var{m}3} instruction pattern
@cindex @code{max@var{m}3} instruction pattern
@item @samp{smin@var{m}3}, @samp{smax@var{m}3}
===================================================================
@@ -190,6 +190,8 @@ enum optab_index
OTI_pow,
/* Arc tangent of y/x */
OTI_atan2,
+ /* Floating multiply/add */
+ OTI_fma,
/* Move instruction. */
OTI_mov,
@@ -432,6 +434,7 @@ enum optab_index
#define umax_optab (&optab_table[OTI_umax])
#define pow_optab (&optab_table[OTI_pow])
#define atan2_optab (&optab_table[OTI_atan2])
+#define fma_optab (&optab_table[OTI_fma])
#define mov_optab (&optab_table[OTI_mov])
#define movstrict_optab (&optab_table[OTI_movstrict])
===================================================================
@@ -159,6 +159,7 @@ static const char * const optabs[] =
"set_optab_handler (sqrt_optab, $A, CODE_FOR_$(sqrt$a2$))",
"set_optab_handler (floor_optab, $A, CODE_FOR_$(floor$a2$))",
"set_convert_optab_handler (lfloor_optab, $B, $A, CODE_FOR_$(lfloor$F$a$I$b2$))",
+ "set_optab_handler (fma_optab, $A, CODE_FOR_$(fma$a4$))",
"set_optab_handler (ceil_optab, $A, CODE_FOR_$(ceil$a2$))",
"set_convert_optab_handler (lceil_optab, $B, $A, CODE_FOR_$(lceil$F$a$I$b2$))",
"set_optab_handler (round_optab, $A, CODE_FOR_$(round$a2$))",
===================================================================
@@ -106,6 +106,7 @@ static void expand_errno_check (tree, rt
static rtx expand_builtin_mathfn (tree, rtx, rtx);
static rtx expand_builtin_mathfn_2 (tree, rtx, rtx);
static rtx expand_builtin_mathfn_3 (tree, rtx, rtx);
+static rtx expand_builtin_mathfn_ternary (tree, rtx, rtx);
static rtx expand_builtin_interclass_mathfn (tree, rtx);
static rtx expand_builtin_sincos (tree);
static rtx expand_builtin_cexpi (tree, rtx);
@@ -2185,6 +2186,79 @@ expand_builtin_mathfn_2 (tree exp, rtx t
return target;
}
+/* Expand a call to the builtin trinary math functions (fma).
+ Return NULL_RTX if a normal call should be emitted rather than expanding the
+ function in-line. EXP is the expression that is a call to the builtin
+ function; if convenient, the result should be placed in TARGET.
+ SUBTARGET may be used as the target for computing one of EXP's
+ operands. */
+
+static rtx
+expand_builtin_mathfn_ternary (tree exp, rtx target, rtx subtarget)
+{
+ optab builtin_optab;
+ rtx op0, op1, op2, insns;
+ tree fndecl = get_callee_fndecl (exp);
+ tree arg0, arg1, arg2;
+ enum machine_mode mode;
+
+ if (!validate_arglist (exp, REAL_TYPE, REAL_TYPE, REAL_TYPE, VOID_TYPE))
+ return NULL_RTX;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+
+ switch (DECL_FUNCTION_CODE (fndecl))
+ {
+ CASE_FLT_FN (BUILT_IN_FMA):
+ builtin_optab = fma_optab; break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Make a suitable register to place result in. */
+ mode = TYPE_MODE (TREE_TYPE (exp));
+
+ /* Before working hard, check whether the instruction is available. */
+ if (optab_handler (builtin_optab, mode) == CODE_FOR_nothing)
+ return NULL_RTX;
+
+ target = gen_reg_rtx (mode);
+
+ /* Always stabilize the argument list. */
+ CALL_EXPR_ARG (exp, 0) = arg0 = builtin_save_expr (arg0);
+ CALL_EXPR_ARG (exp, 1) = arg1 = builtin_save_expr (arg1);
+ CALL_EXPR_ARG (exp, 2) = arg2 = builtin_save_expr (arg2);
+
+ op0 = expand_expr (arg0, subtarget, VOIDmode, EXPAND_NORMAL);
+ op1 = expand_normal (arg1);
+ op2 = expand_normal (arg2);
+
+ start_sequence ();
+
+ /* Compute into TARGET.
+ Set TARGET to wherever the result comes back. */
+ target = expand_ternary_op (mode, builtin_optab, op0, op1, op2,
+ target, 0);
+
+ /* If we were unable to expand via the builtin, stop the sequence
+ (without outputting the insns) and call to the library function
+ with the stabilized argument list. */
+ if (target == 0)
+ {
+ end_sequence ();
+ return expand_call (exp, target, target == const0_rtx);
+ }
+
+ /* Output the entire sequence. */
+ insns = get_insns ();
+ end_sequence ();
+ emit_insn (insns);
+
+ return target;
+}
+
/* Expand a call to the builtin sin and cos math functions.
Return NULL_RTX if a normal call should be emitted rather than expanding the
function in-line. EXP is the expression that is a call to the builtin
@@ -5829,6 +5903,12 @@ expand_builtin (tree exp, rtx target, rt
return target;
break;
+ CASE_FLT_FN (BUILT_IN_FMA):
+ target = expand_builtin_mathfn_ternary (exp, target, subtarget);
+ if (target)
+ return target;
+ break;
+
CASE_FLT_FN (BUILT_IN_ILOGB):
if (! flag_unsafe_math_optimizations)
break;
===================================================================
@@ -0,0 +1,84 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "xvmadd" 2 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+
+/* Only the functions calling the bulitin should generate an appropriate (a *
+ b) + c instruction. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]);
+}
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = (vdb[i] * vdc[i]) + vdd[i];
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
===================================================================
@@ -0,0 +1,61 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
+/* { dg-final { scan-assembler-times "fmadd " 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+ since -mfused-madd is on by default. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
===================================================================
@@ -0,0 +1,61 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power6 -maltivec -ffast-math -mno-fused-madd" } */
+/* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
+/* { dg-final { scan-assembler-times "fmadd " 1 } } */
+/* { dg-final { scan-assembler-times "fmadds" 1 } } */
+
+/* Only the functions calling the builtin should generate an appropriate
+ (a * b) + c instruction. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
===================================================================
@@ -0,0 +1,84 @@
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -ftree-vectorize -mcpu=power7 -ffast-math" } */
+/* { dg-final { scan-assembler-times "xvmadd" 4 } } */
+/* { dg-final { scan-assembler-times "xsmadd" 2 } } */
+/* { dg-final { scan-assembler-times "fmadds" 2 } } */
+
+/* All functions should generate an appropriate (a * b) + c instruction
+ since -mfused-madd is on by default. */
+
+double
+builtin_fma (double b, double c, double d)
+{
+ return __builtin_fma (b, c, d);
+}
+
+float
+builtin_fmaf (float b, float c, float d)
+{
+ return __builtin_fmaf (b, c, d);
+}
+
+double
+normal_fma (double b, double c, double d)
+{
+ return (b * c) + d;
+}
+
+float
+normal_fmaf (float b, float c, float d)
+{
+ return (b * c) + d;
+}
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double vda[SIZE] __attribute__((__aligned__(32)));
+double vdb[SIZE] __attribute__((__aligned__(32)));
+double vdc[SIZE] __attribute__((__aligned__(32)));
+double vdd[SIZE] __attribute__((__aligned__(32)));
+
+float vfa[SIZE] __attribute__((__aligned__(32)));
+float vfb[SIZE] __attribute__((__aligned__(32)));
+float vfc[SIZE] __attribute__((__aligned__(32)));
+float vfd[SIZE] __attribute__((__aligned__(32)));
+
+void
+vector_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = __builtin_fma (vdb[i], vdc[i], vdd[i]);
+}
+
+void
+vector_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = __builtin_fmaf (vfb[i], vfc[i], vfd[i]);
+}
+
+void
+vnormal_fma (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vda[i] = (vdb[i] * vdc[i]) + vdd[i];
+}
+
+void
+vnormal_fmaf (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ vfa[i] = (vfb[i] * vfc[i]) + vfd[i];
+}
===================================================================
@@ -1357,10 +1357,10 @@ vectorizable_call (gimple stmt, gimple_s
vectype_in = NULL_TREE;
nargs = gimple_call_num_args (stmt);
- /* Bail out if the function has more than two arguments, we
- do not have interesting builtin functions to vectorize with
- more than two arguments. No arguments is also not good. */
- if (nargs == 0 || nargs > 2)
+ /* Bail out if the function has more than three arguments, we do not have
+ interesting builtin functions to vectorize with more than two arguments
+ except for fma. No arguments is also not good. */
+ if (nargs == 0 || nargs > 3)
return false;
for (i = 0; i < nargs; i++)
===================================================================
@@ -3895,6 +3895,22 @@ rs6000_builtin_vectorized_function (tree
if (VECTOR_UNIT_ALTIVEC_P (V4SFmode))
return rs6000_builtin_decls[ALTIVEC_BUILTIN_VRFIM];
break;
+ case BUILT_IN_FMA:
+ if (VECTOR_UNIT_VSX_P (V2DFmode)
+ && out_mode == DFmode && out_n == 2
+ && in_mode == DFmode && in_n == 2)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVMADDDP];
+ break;
+ case BUILT_IN_FMAF:
+ if (VECTOR_UNIT_VSX_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[VSX_BUILTIN_XVMADDSP];
+ else if (VECTOR_UNIT_ALTIVEC_P (V4SFmode)
+ && out_mode == SFmode && out_n == 4
+ && in_mode == SFmode && in_n == 4)
+ return rs6000_builtin_decls[ALTIVEC_BUILTIN_VMADDFP];
+ break;
case BUILT_IN_TRUNC:
if (VECTOR_UNIT_VSX_P (V2DFmode)
&& out_mode == DFmode && out_n == 2
===================================================================
@@ -110,6 +110,7 @@ (define_constants
(UNSPEC_LFIWAX 56)
(UNSPEC_LFIWZX 57)
(UNSPEC_FCTIWUZ 58)
+ (UNSPEC_FMA 59)
])
;;
@@ -5844,6 +5845,39 @@ (define_insn "fres"
"fres %0,%1"
[(set_attr "type" "fp")])
+(define_expand "fmasf4"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "")
+ (plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "")
+ (match_operand:SF 2 "gpc_reg_operand" ""))
+ (match_operand:SF 3 "gpc_reg_operand" "")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS
+ && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT)
+ || ! TARGET_POWERPC)"
+{
+ if (!TARGET_FUSED_MADD)
+ {
+ if (TARGET_POWERPC)
+ emit_insn (gen_fmasf4_powerpc (operands[0], operands[1], operands[2],
+ operands[3]));
+ else
+ emit_insn (gen_fmasf4_power (operands[0], operands[1], operands[2],
+ operands[3]));
+ DONE;
+ }
+})
+
+; Fma builtin that will work even if -mno-fused-madd is used.
+(define_insn "fmasf4_powerpc"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (match_operand:SF 3 "gpc_reg_operand" "f")]
+ UNSPEC_FMA))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_POWERPC"
+ "fmadds %0,%1,%2,%3"
+ [(set_attr "type" "fp")
+ (set_attr "fp_type" "fp_maddsub_s")])
+
(define_insn "*fmaddsf4_powerpc"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
@@ -5855,6 +5889,16 @@ (define_insn "*fmaddsf4_powerpc"
[(set_attr "type" "fp")
(set_attr "fp_type" "fp_maddsub_s")])
+(define_insn "fmasf4_power"
+ [(set (match_operand:SF 0 "gpc_reg_operand" "=f")
+ (unspec:SF [(match_operand:SF 1 "gpc_reg_operand" "f")
+ (match_operand:SF 2 "gpc_reg_operand" "f")
+ (match_operand:SF 3 "gpc_reg_operand" "f")]
+ UNSPEC_FMA))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && !TARGET_POWERPC"
+ "{fma|fmadd} %0,%1,%2,%3"
+ [(set_attr "type" "dmul")])
+
(define_insn "*fmaddsf4_power"
[(set (match_operand:SF 0 "gpc_reg_operand" "=f")
(plus:SF (mult:SF (match_operand:SF 1 "gpc_reg_operand" "%f")
@@ -6280,6 +6324,40 @@ (define_insn "*rsqrtdf_internal1"
"frsqrte %0,%1"
[(set_attr "type" "fp")])
+(define_expand "fmadf4"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "")
+ (plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "")
+ (match_operand:DF 2 "gpc_reg_operand" ""))
+ (match_operand:DF 3 "gpc_reg_operand" "")))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS
+ && ((TARGET_POWERPC && TARGET_SINGLE_FLOAT)
+ || ! TARGET_POWERPC)"
+{
+ if (!TARGET_FUSED_MADD)
+ {
+ if (VECTOR_UNIT_VSX_P (DFmode))
+ emit_insn (gen_vsx_fmadddf4_2 (operands[0], operands[1], operands[2],
+ operands[3]));
+ else
+ emit_insn (gen_fmadf4_fpr (operands[0], operands[1], operands[2],
+ operands[3]));
+ DONE;
+ }
+})
+
+; Fma builtin that will work even if -mno-fused-madd is used.
+(define_insn "fmadf4_fpr"
+ [(set (match_operand:DF 0 "gpc_reg_operand" "=d")
+ (unspec:DF [(match_operand:DF 1 "gpc_reg_operand" "d")
+ (match_operand:DF 2 "gpc_reg_operand" "d")
+ (match_operand:DF 3 "gpc_reg_operand" "d")]
+ UNSPEC_FMA))]
+ "TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_DOUBLE_FLOAT
+ && VECTOR_UNIT_NONE_P (DFmode)"
+ "{fma|fmadd} %0,%1,%2,%3"
+ [(set_attr "type" "dmul")
+ (set_attr "fp_type" "fp_maddsub_d")])
+
(define_insn "*fmadddf4_fpr"
[(set (match_operand:DF 0 "gpc_reg_operand" "=d")
(plus:DF (mult:DF (match_operand:DF 1 "gpc_reg_operand" "%d")