@@ -177,7 +177,7 @@ extern void ix86_expand_trunc (rtx, rtx);
extern void ix86_expand_truncdf_32 (rtx, rtx);
#ifdef TREE_CODE
-extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree);
+extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
#endif /* TREE_CODE */
#endif /* RTX_CODE */
@@ -56,6 +56,293 @@ along with GCC; see the file COPYING3. If not see
#include "debug.h"
#include "dwarf2out.h"
#include "sched-int.h"
+
+typedef struct block_info_def
+{
+ /* TRUE if the upper 128bits of any AVX registers are live at exit. */
+ bool upper_128bits_set;
+ /* TRUE if block has been processed. */
+ bool done;
+} *block_info;
+
+#define BLOCK_INFO(B) ((block_info) (B)->aux)
+
+enum call_avx256_state
+{
+ /* Callee returns 256bit AVX register. */
+ callee_return_avx256 = -1,
+ /* Callee returns and passes 256bit AVX register. */
+ callee_return_pass_avx256,
+ /* Callee passes 256bit AVX register. */
+ callee_pass_avx256,
+  /* Callee neither returns nor passes a 256bit AVX register, or no
+     256bit AVX register is used in the function return.  */
+ call_no_avx256,
+ /* vzeroupper intrinsic. */
+ vzeroupper_intrinsic
+};
+
+/* note_stores callback.  Set *(bool *) DATA to true if DEST or the
+   source of SET is a 256bit AVX register.  */
+
+static void
+check_avx256_stores (rtx dest, const_rtx set, void *data)
+{
+ if ((REG_P (dest)
+ && VALID_AVX256_REG_MODE (GET_MODE (dest)))
+ || (GET_CODE (set) == SET
+ && REG_P (SET_SRC (set))
+ && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
+ {
+ bool *upper_128bits_set = (bool *) data;
+ *upper_128bits_set = true;
+ }
+}
+
+/* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
+ in basic block BB. Delete it if upper 128bit AVX registers are
+   unused.  If it isn't deleted, move it to just before a jump or call insn.
+
+   UPPER_128BITS_SET is TRUE if the upper 128bits of any AVX registers
+ are live at entry. */
+
+static void
+move_or_delete_vzeroupper_2 (basic_block bb, bool upper_128bits_set)
+{
+ rtx curr_insn, next_insn, prev_insn, insn;
+
+ if (dump_file)
+ fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n",
+ bb->index, upper_128bits_set);
+
+ for (curr_insn = BB_HEAD (bb);
+ curr_insn && curr_insn != NEXT_INSN (BB_END (bb));
+ curr_insn = next_insn)
+ {
+ int avx256;
+
+ next_insn = NEXT_INSN (curr_insn);
+
+ if (!NONDEBUG_INSN_P (curr_insn))
+ continue;
+
+ /* Search for vzeroupper. */
+ insn = PATTERN (curr_insn);
+ if (GET_CODE (insn) == UNSPEC_VOLATILE
+ && XINT (insn, 1) == UNSPECV_VZEROUPPER)
+ {
+ /* Found vzeroupper. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Found vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ }
+ else
+ {
+ /* Check vzeroall intrinsic. */
+ if (GET_CODE (insn) == PARALLEL
+ && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE
+ && XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL)
+ upper_128bits_set = false;
+ else if (!upper_128bits_set)
+ {
+ /* Check if upper 128bits of AVX registers are used. */
+ note_stores (insn, check_avx256_stores,
+ &upper_128bits_set);
+ }
+ continue;
+ }
+
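+      /* The operand of the UNSPEC_VOLATILE is a const_int holding the
+         call_avx256_state this vzeroupper was emitted with.  */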
+ avx256 = INTVAL (XVECEXP (insn, 0, 0));
+
+ if (!upper_128bits_set)
+ {
+          /* Since the upper 128bits are cleared, no 256bit AVX register
+             can be passed to the callee.  We only need to check whether
+             the callee returns a 256bit AVX register.  */
+ upper_128bits_set = avx256 == callee_return_avx256;
+
+ /* Remove unnecessary vzeroupper since upper 128bits are
+ cleared. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Delete redundant vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ continue;
+ }
+ else if (avx256 == callee_return_pass_avx256
+ || avx256 == callee_pass_avx256)
+ {
+ /* Callee passes 256bit AVX register. Check if callee
+ returns 256bit AVX register. */
+ upper_128bits_set = avx256 == callee_return_pass_avx256;
+
+ /* Must remove vzeroupper since callee passes 256bit AVX
+ register. */
+ if (dump_file)
+ {
+ fprintf (dump_file, "Delete callee pass vzeroupper:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ continue;
+ }
+
+      /* Find the jump or call after vzeroupper.  */
+ prev_insn = curr_insn;
+ if (avx256 == vzeroupper_intrinsic)
+ {
+ /* For vzeroupper intrinsic, check if there is another
+ vzeroupper. */
+ insn = NEXT_INSN (curr_insn);
+ while (insn)
+ {
+ if (NONJUMP_INSN_P (insn)
+ && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
+ && XINT (PATTERN (insn), 1) == UNSPECV_VZEROUPPER)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file,
+ "Delete redundant vzeroupper intrinsic:\n");
+ print_rtl_single (dump_file, curr_insn);
+ }
+ delete_insn (curr_insn);
+ insn = NULL;
+ continue;
+ }
+
+ if (JUMP_P (insn) || CALL_P (insn))
+ break;
+ prev_insn = insn;
+ insn = NEXT_INSN (insn);
+ if (insn == NEXT_INSN (BB_END (bb)))
+ break;
+ }
+
+ /* Continue if redundant vzeroupper intrinsic is deleted. */
+ if (!insn)
+ continue;
+ }
+ else
+ {
+ /* Find the next jump/call. */
+ insn = NEXT_INSN (curr_insn);
+ while (insn)
+ {
+ if (JUMP_P (insn) || CALL_P (insn))
+ break;
+ prev_insn = insn;
+ insn = NEXT_INSN (insn);
+ if (insn == NEXT_INSN (BB_END (bb)))
+ break;
+ }
+
+ if (!insn)
+	gcc_unreachable ();
+ }
+
+ /* Keep vzeroupper. */
+ upper_128bits_set = false;
+
+      /* Also allow a label as the next instruction.  */
+      if (insn == NEXT_INSN (BB_END (bb)) && !LABEL_P (insn))
+	gcc_unreachable ();
+
+      /* Move vzeroupper before the jump/call if needed.  */
+ if (curr_insn != prev_insn)
+ {
+ reorder_insns_nobb (curr_insn, curr_insn, prev_insn);
+ if (dump_file)
+ {
+ fprintf (dump_file, "Move vzeroupper after:\n");
+ print_rtl_single (dump_file, prev_insn);
+ fprintf (dump_file, "before:\n");
+ print_rtl_single (dump_file, insn);
+ }
+ }
+
+ next_insn = NEXT_INSN (insn);
+ }
+
+ BLOCK_INFO (bb)->upper_128bits_set = upper_128bits_set;
+
+ if (dump_file)
+ fprintf (dump_file, " BB [%i] exit: upper 128bits: %d\n",
+ bb->index, upper_128bits_set);
+}
+
+/* Helper function for move_or_delete_vzeroupper. Process vzeroupper
+ in BLOCK and its predecessor blocks recursively. */
+
+static void
+move_or_delete_vzeroupper_1 (basic_block block)
+{
+ edge e;
+ edge_iterator ei;
+ bool upper_128bits_set;
+
+ if (dump_file)
+ fprintf (dump_file, " Process BB [%i]: status: %d\n",
+ block->index, BLOCK_INFO (block)->done);
+
+ if (BLOCK_INFO (block)->done)
+ return;
+
+ BLOCK_INFO (block)->done = true;
+
+ upper_128bits_set = false;
+
+ /* Process all predecessor edges of this block. */
+ FOR_EACH_EDGE (e, ei, block->preds)
+ {
+ if (e->src == block)
+ continue;
+ move_or_delete_vzeroupper_1 (e->src);
+ if (BLOCK_INFO (e->src)->upper_128bits_set)
+ upper_128bits_set = true;
+ }
+
+ /* Process this block. */
+ move_or_delete_vzeroupper_2 (block, upper_128bits_set);
+}
+
+/* Go through the instruction stream looking for vzeroupper. Delete
+ it if upper 128bit AVX registers are unused. If it isn't deleted,
+ move it to just before a jump insn. */
+
+static void
+move_or_delete_vzeroupper (void)
+{
+ edge e;
+ edge_iterator ei;
+
+ /* Set up block info for each basic block. */
+ alloc_aux_for_blocks (sizeof (struct block_info_def));
+
+ /* Process successor blocks of all entry points. */
+ if (dump_file)
+ fprintf (dump_file, "Process all entry points\n");
+
+ FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
+ {
+ move_or_delete_vzeroupper_2 (e->dest,
+ cfun->machine->caller_pass_avx256_p);
+ BLOCK_INFO (e->dest)->done = true;
+ }
+
+ /* Process predecessor blocks of all exit points. */
+ if (dump_file)
+ fprintf (dump_file, "Process all exit points\n");
+
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
+ move_or_delete_vzeroupper_1 (e->src);
+
+ free_aux_for_blocks ();
+}
+
static rtx legitimize_dllimport_symbol (rtx, bool);
#ifndef CHECK_STACK_LIMIT
@@ -2633,6 +2920,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune,
{ "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
{ "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
{ "-m8bit-idiv", MASK_USE_8BIT_IDIV },
+ { "-mvzeroupper", MASK_VZEROUPPER },
};
const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
@@ -3712,6 +4000,60 @@ ix86_option_override_internal (bool main_args_p)
if (main_args_p)
target_option_default_node = target_option_current_node
= build_target_option_node ();
+
+ if (TARGET_AVX)
+ {
+ /* Enable vzeroupper pass by default for TARGET_AVX. */
+ if (!(target_flags_explicit & MASK_VZEROUPPER))
+ target_flags |= MASK_VZEROUPPER;
+ }
+ else
+ {
+ /* Disable vzeroupper pass if TARGET_AVX is disabled. */
+ target_flags &= ~MASK_VZEROUPPER;
+ }
+}
+
+/* Return TRUE if type TYPE and mode MODE use 256bit AVX modes. */
+
+static bool
+use_avx256_p (enum machine_mode mode, const_tree type)
+{
+ return (VALID_AVX256_REG_MODE (mode)
+ || (type
+ && TREE_CODE (type) == VECTOR_TYPE
+ && int_size_in_bytes (type) == 32));
+}
+
+/* Return TRUE if VAL is passed in a register with a 256bit AVX mode.  */
+
+static bool
+function_pass_avx256_p (const_rtx val)
+{
+ if (!val)
+ return false;
+
+ if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
+ return true;
+
+ if (GET_CODE (val) == PARALLEL)
+ {
+ int i;
+ rtx r;
+
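+      /* A PARALLEL value is a vector of (EXPR_LIST reg offset) pieces;
+         look for any piece whose register is in a 256bit AVX mode
+         (including OImode).  */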
+ for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
+ {
+ r = XVECEXP (val, 0, i);
+ if (GET_CODE (r) == EXPR_LIST
+ && XEXP (r, 0)
+ && REG_P (XEXP (r, 0))
+ && (GET_MODE (XEXP (r, 0)) == OImode
+ || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
+ return true;
+ }
+ }
+
+ return false;
}
/* Implement the TARGET_OPTION_OVERRIDE hook. */
@@ -4626,7 +4968,14 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
return false;
}
else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
- ;
+ {
+ /* Disable sibcall if we need to generate vzeroupper after
+ callee returns. */
+ if (TARGET_VZEROUPPER
+ && cfun->machine->callee_return_avx256_p
+ && !cfun->machine->caller_return_avx256_p)
+ return false;
+ }
else if (!rtx_equal_p (a, b))
return false;
@@ -5243,15 +5592,54 @@ void
init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
tree fntype, /* tree ptr for function decl */
rtx libname, /* SYMBOL_REF of library name or 0 */
- tree fndecl)
+ tree fndecl,
+ int caller)
{
- struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
+ struct cgraph_local_info *i;
+ tree fnret_type;
+
memset (cum, 0, sizeof (*cum));
+ /* Initialize for the current callee. */
+ if (caller)
+ {
+ cfun->machine->callee_pass_avx256_p = false;
+ cfun->machine->callee_return_avx256_p = false;
+ }
+
if (fndecl)
- cum->call_abi = ix86_function_abi (fndecl);
+ {
+ i = cgraph_local_info (fndecl);
+ cum->call_abi = ix86_function_abi (fndecl);
+ fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
+ }
else
- cum->call_abi = ix86_function_type_abi (fntype);
+ {
+ i = NULL;
+ cum->call_abi = ix86_function_type_abi (fntype);
+ if (fntype)
+ fnret_type = TREE_TYPE (fntype);
+ else
+ fnret_type = NULL;
+ }
+
+ if (TARGET_VZEROUPPER && fnret_type)
+ {
+ rtx fnret_value = ix86_function_value (fnret_type, fntype,
+ false);
+ if (function_pass_avx256_p (fnret_value))
+ {
+ /* The return value of this function uses 256bit AVX modes. */
+ cfun->machine->use_avx256_p = true;
+ if (caller)
+ cfun->machine->callee_return_avx256_p = true;
+ else
+ cfun->machine->caller_return_avx256_p = true;
+ }
+ }
+
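+  /* Remember whether CUM describes an outgoing call (the current
+     function is the caller) or the incoming arguments of the current
+     function.  */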
+ cum->caller = caller;
+
/* Set up the number of registers to use for passing arguments. */
if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
@@ -6488,6 +6876,7 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
{
enum machine_mode mode = omode;
HOST_WIDE_INT bytes, words;
+ rtx arg;
if (mode == BLKmode)
bytes = int_size_in_bytes (type);
@@ -6501,11 +6890,23 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
mode = type_natural_mode (type, cum);
if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
- return function_arg_ms_64 (cum, mode, omode, named, bytes);
+ arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
else if (TARGET_64BIT)
- return function_arg_64 (cum, mode, omode, type, named);
+ arg = function_arg_64 (cum, mode, omode, type, named);
else
- return function_arg_32 (cum, mode, omode, type, bytes, words);
+ arg = function_arg_32 (cum, mode, omode, type, bytes, words);
+
+ if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
+ {
+ /* This argument uses 256bit AVX modes. */
+ cfun->machine->use_avx256_p = true;
+ if (cum->caller)
+ cfun->machine->callee_pass_avx256_p = true;
+ else
+ cfun->machine->caller_pass_avx256_p = true;
+ }
+
+ return arg;
}
/* A C expression that indicates when an argument must be passed by
@@ -10353,6 +10754,15 @@ ix86_expand_epilogue (int style)
return;
}
+ /* Emit vzeroupper if needed. */
+ if (TARGET_VZEROUPPER
+ && cfun->machine->use_avx256_p
+ && !cfun->machine->caller_return_avx256_p)
+ {
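+      /* The function used 256bit AVX registers somewhere but does not
+         itself return a 256bit AVX value, so clear the upper halves
+         before returning.  */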
+ cfun->machine->use_vzeroupper_p = 1;
+ emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
+ }
+
if (crtl->args.pops_args && crtl->args.size)
{
rtx popc = GEN_INT (crtl->args.pops_args);
@@ -20910,6 +21320,25 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
+ 2, vec));
}
+ /* Emit vzeroupper if needed. */
+ if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
+ {
+ rtx avx256;
+ cfun->machine->use_vzeroupper_p = 1;
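+      /* Encode whether this call passes and/or returns 256bit AVX
+         values; the vzeroupper optimization in ix86_reorg uses this to
+         decide whether the emitted vzeroupper can be removed.  */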
+ if (cfun->machine->callee_pass_avx256_p)
+ {
+ if (cfun->machine->callee_return_avx256_p)
+ avx256 = GEN_INT (callee_return_pass_avx256);
+ else
+ avx256 = GEN_INT (callee_pass_avx256);
+ }
+ else if (cfun->machine->callee_return_avx256_p)
+ avx256 = GEN_INT (callee_return_avx256);
+ else
+ avx256 = GEN_INT (call_no_avx256);
+ emit_insn (gen_avx_vzeroupper (avx256));
+ }
+
call = emit_call_insn (call);
if (use)
CALL_INSN_FUNCTION_USAGE (call) = use;
@@ -21653,6 +22082,9 @@ ix86_local_alignment (tree exp, enum machine_mode mode,
decl = NULL;
}
+ if (use_avx256_p (mode, type))
+ cfun->machine->use_avx256_p = true;
+
/* Don't do dynamic stack realignment for long long objects with
-mpreferred-stack-boundary=2. */
if (!TARGET_64BIT
@@ -21748,9 +22180,6 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
{
tree type, decl;
- if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
- return align;
-
if (exp && DECL_P (exp))
{
type = TREE_TYPE (exp);
@@ -21762,6 +22191,12 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
decl = NULL;
}
+ if (use_avx256_p (mode, type))
+ cfun->machine->use_avx256_p = true;
+
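+  /* The early exit below is done after TYPE has been examined so that
+     256bit AVX uses are still recorded above.  */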
+ if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+ return align;
+
/* Don't do dynamic stack realignment for long long objects with
-mpreferred-stack-boundary=2. */
if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
@@ -25505,6 +25940,8 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
switch ((enum ix86_builtin_func_type) d->flag)
{
case VOID_FTYPE_VOID:
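+      /* Use vzeroupper_intrinsic as the operand of the vzeroupper
+         pattern so that ix86_reorg can tell an explicit intrinsic from
+         the vzerouppers emitted around calls and returns.  */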
+ if (icode == CODE_FOR_avx_vzeroupper)
+ target = GEN_INT (vzeroupper_intrinsic);
emit_insn (GEN_FCN (icode) (target));
return 0;
case VOID_FTYPE_UINT64:
@@ -28542,6 +28979,10 @@ ix86_reorg (void)
ix86_avoid_jump_mispredicts ();
#endif
}
+
+ /* Run the vzeroupper optimization if needed. */
+ if (cfun->machine->use_vzeroupper_p)
+ move_or_delete_vzeroupper ();
}
/* Return nonzero when QImode register that must be represented via REX prefix
@@ -1507,6 +1507,7 @@ typedef struct ix86_args {
int mmx_nregs; /* # mmx registers available for passing */
int mmx_regno; /* next available mmx register number */
int maybe_vaarg; /* true for calls to possibly vardic fncts. */
+  int caller;			/* true if CUM is for an outgoing call */
int float_in_sse; /* Set to 1 or 2 for 32bit targets if
SFmode/DFmode arguments should be passed
in SSE registers. Otherwise 0. */
@@ -1519,7 +1520,8 @@ typedef struct ix86_args {
For a library call, FNTYPE is 0. */
#define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \
- init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL))
+ init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \
+ (N_NAMED_ARGS) != -1)
/* Output assembler code to FILE to increment profiler label # LABELNO
for profiling a function entry. */
@@ -2289,6 +2291,24 @@ struct GTY(()) machine_function {
stack below the return address. */
BOOL_BITFIELD static_chain_on_stack : 1;
+ /* Nonzero if the current function uses vzeroupper. */
+ BOOL_BITFIELD use_vzeroupper_p : 1;
+
+  /* Nonzero if the current function uses 256bit AVX registers.  */
+ BOOL_BITFIELD use_avx256_p : 1;
+
+ /* Nonzero if caller passes 256bit AVX modes. */
+ BOOL_BITFIELD caller_pass_avx256_p : 1;
+
+ /* Nonzero if caller returns 256bit AVX modes. */
+ BOOL_BITFIELD caller_return_avx256_p : 1;
+
+ /* Nonzero if the current callee passes 256bit AVX modes. */
+ BOOL_BITFIELD callee_pass_avx256_p : 1;
+
+ /* Nonzero if the current callee returns 256bit AVX modes. */
+ BOOL_BITFIELD callee_return_avx256_p : 1;
+
/* During prologue/epilogue generation, the current frame state.
Otherwise, the frame state at the end of the prologue. */
struct machine_frame_state fs;
@@ -256,6 +256,11 @@ mcld
Target Report Mask(CLD) Save
Generate cld instruction in the function prologue.
+mvzeroupper
+Target Report Mask(VZEROUPPER) Save
+Generate vzeroupper instruction before a transfer of control flow out of
+the function.
+
mfused-madd
Target Report Mask(FUSED_MADD) Save
Enable automatic generation of fused floating point multiply-add instructions
@@ -11508,29 +11508,11 @@
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
-;; vzeroupper clobbers the upper 128bits of AVX registers.
-(define_expand "avx_vzeroupper"
- [(match_par_dup 0 [(const_int 0)])]
- "TARGET_AVX"
-{
- int nregs = TARGET_64BIT ? 16 : 8;
- int regno;
-
- operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1));
-
- XVECEXP (operands[0], 0, 0)
- = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx),
- UNSPECV_VZEROUPPER);
-
- for (regno = 0; regno < nregs; regno++)
- XVECEXP (operands[0], 0, regno + 1)
- = gen_rtx_CLOBBER (VOIDmode,
- gen_rtx_REG (V8SImode, SSE_REGNO (regno)));
-})
-
-(define_insn "*avx_vzeroupper"
- [(match_parallel 0 "vzeroupper_operation"
- [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
+;; Clear the upper 128bits of AVX registers, equivalent to a NOP
+;; if the upper 128bits are unused.
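+;; The const_int operand records the call_avx256_state of the call,
+;; return or intrinsic this vzeroupper was emitted for; it is only
+;; examined by the vzeroupper optimization in ix86_reorg.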
+(define_insn "avx_vzeroupper"
+ [(unspec_volatile [(match_operand 0 "const_int_operand" "")]
+ UNSPECV_VZEROUPPER)]
"TARGET_AVX"
"vzeroupper"
[(set_attr "type" "sse")
@@ -594,7 +594,7 @@ Objective-C and Objective-C++ Dialects}.
-mno-wide-multiply -mrtd -malign-double @gol
-mpreferred-stack-boundary=@var{num}
-mincoming-stack-boundary=@var{num} @gol
--mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol
+-mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip -mvzeroupper @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol
-maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfused-madd @gol
-msse4a -m3dnow -mpopcnt -mabm -mfma4 -mxop -mlwp @gol
@@ -12466,6 +12466,13 @@ GCC with the @option{--enable-cld} configure option. Generation of @code{cld}
instructions can be suppressed with the @option{-mno-cld} compiler option
in this case.
+@item -mvzeroupper
+@opindex mvzeroupper
+This option instructs GCC to emit a @code{vzeroupper} instruction
+before a transfer of control flow out of the function, both to minimize
+the AVX to SSE transition penalty and to remove unnecessary
+@code{zeroupper} intrinsics.
+
@item -mcx16
@opindex mcx16
This option will enable GCC to use CMPXCHG16B instruction in generated code.
@@ -1,6 +1,6 @@
/* { dg-do run } */
/* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
#include "avx-check.h"
new file mode 100644
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */
new file mode 100644
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern float x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */
new file mode 100644
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
+/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mno-vzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
@@ -1,6 +1,6 @@
/* { dg-do run } */
/* { dg-require-effective-target avx } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-O2 -mavx -mtune=generic" } */
#include "avx-check.h"
new file mode 100644
@@ -0,0 +1,34 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O2 -mavx -mvzeroupper" } */
+
+#include "avx-check.h"
+
+int s[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+int d[8] = {11, 22, 33, 44, 55, 66, 77, 88};
+
+void
+__attribute__((noinline))
+foo ()
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE (d); i++)
+ d[i] = s[i] + 0x1000;
+}
+
+static void
+__attribute__((noinline))
+bar (__m256i src)
+{
+ foo ();
+ _mm256_storeu_si256 ((__m256i*) d, src);
+ if (__builtin_memcmp (d, s, sizeof (d)))
+ abort ();
+}
+
+static void
+avx_test (void)
+{
+ __m256i src = _mm256_loadu_si256 ((__m256i*) s);
+ bar (src);
+}
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__));
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+ bar2 (y);
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern void bar2 (__m256);
+extern __m256 y;
+
+void
+foo ()
+{
+ bar2 (y);
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
new file mode 100644
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ x = y;
+ _mm256_zeroall ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */
new file mode 100644
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */
+
+#include <immintrin.h>
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+ _mm256_zeroupper ();
+ x = y;
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+ _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */