PATCH: Add vzeroupper optimization for AVX

Message ID	AANLkTimEeGpXvvH-jwge2RnMH1TbA-r7rei00hxhGrSu@mail.gmail.com
State	New
Headers	show Return-Path: <gcc-patches-return-276499-incoming=patchwork.ozlabs.org@gcc.gnu.org> MIME-Version: 1.0 In-Reply-To: <AANLkTinmdgwjXtjUwvA6xKS-Wmq+YKRMpYrSKV-sw0jQ@mail.gmail.com> References: <20101025085724.GA17893@intel.com> <20101025113816.GT18103@tyan-ft48-01.lab.bos.redhat.com> <AANLkTi=TxYwE4v38HXRy8dEJ6yMeGtZA04v6UOSSZCrp@mail.gmail.com> <AANLkTik6hUkB7u_kiFOGAZDqNpuTUj=2saTpnz3-yM7x@mail.gmail.com> <AANLkTiko88xA978DD9OZKAexb08B7Cr8UeLrs-svK-Fv@mail.gmail.com> <AANLkTin5Z1z9gd62-=D=c69LOWKkx51MGuVwg+TR1QEn@mail.gmail.com> <AANLkTinmdgwjXtjUwvA6xKS-Wmq+YKRMpYrSKV-sw0jQ@mail.gmail.com> Date: Wed, 27 Oct 2010 03:01:41 -0700 Message-ID: <AANLkTimEeGpXvvH-jwge2RnMH1TbA-r7rei00hxhGrSu@mail.gmail.com> Subject: Re: PATCH: Add vzeroupper optimization for AVX From: "H.J. Lu" <hjl.tools@gmail.com> To: Uros Bizjak <ubizjak@gmail.com> Cc: Jakub Jelinek <jakub@redhat.com>, gcc-patches@gcc.gnu.org, Richard Henderson <rth@redhat.com> Content-Type: multipart/mixed; boundary=001485f7cbdce677ca0493964f71 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 9c10103..02c2a90 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -177,7 +177,7 @@ extern void ix86_expand_trunc (rtx, rtx); extern void ix86_expand_truncdf_32 (rtx, rtx); #ifdef TREE_CODE -extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree); +extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); #endif /* TREE_CODE */ #endif /* RTX_CODE */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6f3a898..ae74a4c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -56,6 +56,293 @@ along with GCC; see the file COPYING3. If not see #include "debug.h" #include "dwarf2out.h" #include "sched-int.h" + +typedef struct block_info_def +{ + /* TRUE if the upper 128bits of any AVX registers are live at exit. */ + bool upper_128bits_set; + /* TRUE if block has been processed. */ + bool done; +} *block_info; + +#define BLOCK_INFO(B) ((block_info) (B)->aux) + +enum call_avx256_state +{ + /* Callee returns 256bit AVX register. */ + callee_return_avx256 = -1, + /* Callee returns and passes 256bit AVX register. */ + callee_return_pass_avx256, + /* Callee passes 256bit AVX register. */ + callee_pass_avx256, + /* Callee doesn't return nor passe 256bit AVX register, or no + 256bit AVX register in function return. */ + call_no_avx256, + /* vzeroupper intrinsic. */ + vzeroupper_intrinsic +}; + +/* Check if a 256bit AVX register is referenced in stores. */ + +static void +check_avx256_stores (rtx dest, const_rtx set, void *data) +{ + if ((REG_P (dest) + && VALID_AVX256_REG_MODE (GET_MODE (dest))) + || (GET_CODE (set) == SET + && REG_P (SET_SRC (set)) + && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set))))) + { + bool *upper_128bits_set = (bool *) data; + *upper_128bits_set = true; + } +} + +/* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper + in basic block BB. Delete it if upper 128bit AVX registers are + unused. If it isn't deleted, move it to just before a jump insn. + + UPPER_128BITS_LIVE is TRUE if the upper 128bits of any AVX registers + are live at entry. */ + +static void +move_or_delete_vzeroupper_2 (basic_block bb, bool upper_128bits_set) +{ + rtx curr_insn, next_insn, prev_insn, insn; + + if (dump_file) + fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n", + bb->index, upper_128bits_set); + + for (curr_insn = BB_HEAD (bb); + curr_insn && curr_insn != NEXT_INSN (BB_END (bb)); + curr_insn = next_insn) + { + int avx256; + + next_insn = NEXT_INSN (curr_insn); + + if (!NONDEBUG_INSN_P (curr_insn)) + continue; + + /* Search for vzeroupper. */ + insn = PATTERN (curr_insn); + if (GET_CODE (insn) == UNSPEC_VOLATILE + && XINT (insn, 1) == UNSPECV_VZEROUPPER) + { + /* Found vzeroupper. */ + if (dump_file) + { + fprintf (dump_file, "Found vzeroupper:\n"); + print_rtl_single (dump_file, curr_insn); + } + } + else + { + /* Check vzeroall intrinsic. */ + if (GET_CODE (insn) == PARALLEL + && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE + && XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL) + upper_128bits_set = false; + else if (!upper_128bits_set) + { + /* Check if upper 128bits of AVX registers are used. */ + note_stores (insn, check_avx256_stores, + &upper_128bits_set); + } + continue; + } + + avx256 = INTVAL (XVECEXP (insn, 0, 0)); + + if (!upper_128bits_set) + { + /* Since the upper 128bits are cleared, callee must not pass + 256bit AVX register. We only need to check if callee + returns 256bit AVX register. */ + upper_128bits_set = avx256 == callee_return_avx256; + + /* Remove unnecessary vzeroupper since upper 128bits are + cleared. */ + if (dump_file) + { + fprintf (dump_file, "Delete redundant vzeroupper:\n"); + print_rtl_single (dump_file, curr_insn); + } + delete_insn (curr_insn); + continue; + } + else if (avx256 == callee_return_pass_avx256 + || avx256 == callee_pass_avx256) + { + /* Callee passes 256bit AVX register. Check if callee + returns 256bit AVX register. */ + upper_128bits_set = avx256 == callee_return_pass_avx256; + + /* Must remove vzeroupper since callee passes 256bit AVX + register. */ + if (dump_file) + { + fprintf (dump_file, "Delete callee pass vzeroupper:\n"); + print_rtl_single (dump_file, curr_insn); + } + delete_insn (curr_insn); + continue; + } + + /* Find the jump after vzeroupper. */ + prev_insn = curr_insn; + if (avx256 == vzeroupper_intrinsic) + { + /* For vzeroupper intrinsic, check if there is another + vzeroupper. */ + insn = NEXT_INSN (curr_insn); + while (insn) + { + if (NONJUMP_INSN_P (insn) + && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE + && XINT (PATTERN (insn), 1) == UNSPECV_VZEROUPPER) + { + if (dump_file) + { + fprintf (dump_file, + "Delete redundant vzeroupper intrinsic:\n"); + print_rtl_single (dump_file, curr_insn); + } + delete_insn (curr_insn); + insn = NULL; + continue; + } + + if (JUMP_P (insn) || CALL_P (insn)) + break; + prev_insn = insn; + insn = NEXT_INSN (insn); + if (insn == NEXT_INSN (BB_END (bb))) + break; + } + + /* Continue if redundant vzeroupper intrinsic is deleted. */ + if (!insn) + continue; + } + else + { + /* Find the next jump/call. */ + insn = NEXT_INSN (curr_insn); + while (insn) + { + if (JUMP_P (insn) || CALL_P (insn)) + break; + prev_insn = insn; + insn = NEXT_INSN (insn); + if (insn == NEXT_INSN (BB_END (bb))) + break; + } + + if (!insn) + gcc_unreachable(); + } + + /* Keep vzeroupper. */ + upper_128bits_set = false; + + /* Also allow label as the next instruction. */ + if (insn == NEXT_INSN (BB_END (bb)) && !LABEL_P (insn)) + gcc_unreachable(); + + /* Move vzeroupper before jump/call if neeeded. */ + if (curr_insn != prev_insn) + { + reorder_insns_nobb (curr_insn, curr_insn, prev_insn); + if (dump_file) + { + fprintf (dump_file, "Move vzeroupper after:\n"); + print_rtl_single (dump_file, prev_insn); + fprintf (dump_file, "before:\n"); + print_rtl_single (dump_file, insn); + } + } + + next_insn = NEXT_INSN (insn); + } + + BLOCK_INFO (bb)->upper_128bits_set = upper_128bits_set; + + if (dump_file) + fprintf (dump_file, " BB [%i] exit: upper 128bits: %d\n", + bb->index, upper_128bits_set); +} + +/* Helper function for move_or_delete_vzeroupper. Process vzeroupper + in BLOCK and its predecessor blocks recursively. */ + +static void +move_or_delete_vzeroupper_1 (basic_block block) +{ + edge e; + edge_iterator ei; + bool upper_128bits_set; + + if (dump_file) + fprintf (dump_file, " Process BB [%i]: status: %d\n", + block->index, BLOCK_INFO (block)->done); + + if (BLOCK_INFO (block)->done) + return; + + BLOCK_INFO (block)->done = true; + + upper_128bits_set = false; + + /* Process all predecessor edges of this block. */ + FOR_EACH_EDGE (e, ei, block->preds) + { + if (e->src == block) + continue; + move_or_delete_vzeroupper_1 (e->src); + if (BLOCK_INFO (e->src)->upper_128bits_set) + upper_128bits_set = true; + } + + /* Process this block. */ + move_or_delete_vzeroupper_2 (block, upper_128bits_set); +} + +/* Go through the instruction stream looking for vzeroupper. Delete + it if upper 128bit AVX registers are unused. If it isn't deleted, + move it to just before a jump insn. */ + +static void +move_or_delete_vzeroupper (void) +{ + edge e; + edge_iterator ei; + + /* Set up block info for each basic block. */ + alloc_aux_for_blocks (sizeof (struct block_info_def)); + + /* Process successor blocks of all entry points. */ + if (dump_file) + fprintf (dump_file, "Process all entry points\n"); + + FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs) + { + move_or_delete_vzeroupper_2 (e->dest, + cfun->machine->caller_pass_avx256_p); + BLOCK_INFO (e->dest)->done = true; + } + + /* Process predecessor blocks of all exit points. */ + if (dump_file) + fprintf (dump_file, "Process all exit points\n"); + + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds) + move_or_delete_vzeroupper_1 (e->src); + + free_aux_for_blocks (); +} + static rtx legitimize_dllimport_symbol (rtx, bool); #ifndef CHECK_STACK_LIMIT @@ -2633,6 +2920,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune, { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS }, { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS }, { "-m8bit-idiv", MASK_USE_8BIT_IDIV }, + { "-mvzeroupper", MASK_VZEROUPPER }, }; const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2]; @@ -3712,6 +4000,60 @@ ix86_option_override_internal (bool main_args_p) if (main_args_p) target_option_default_node = target_option_current_node = build_target_option_node (); + + if (TARGET_AVX) + { + /* Enable vzeroupper pass by default for TARGET_AVX. */ + if (!(target_flags_explicit & MASK_VZEROUPPER)) + target_flags |= MASK_VZEROUPPER; + } + else + { + /* Disable vzeroupper pass if TARGET_AVX is disabled. */ + target_flags &= ~MASK_VZEROUPPER; + } +} + +/* Return TRUE if type TYPE and mode MODE use 256bit AVX modes. */ + +static bool +use_avx256_p (enum machine_mode mode, const_tree type) +{ + return (VALID_AVX256_REG_MODE (mode) + || (type + && TREE_CODE (type) == VECTOR_TYPE + && int_size_in_bytes (type) == 32)); +} + +/* Return TRUE if VAL is passed in register with 256bit AVX modes. */ + +static bool +function_pass_avx256_p (const_rtx val) +{ + if (!val) + return false; + + if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val))) + return true; + + if (GET_CODE (val) == PARALLEL) + { + int i; + rtx r; + + for (i = XVECLEN (val, 0) - 1; i >= 0; i--) + { + r = XVECEXP (val, 0, i); + if (GET_CODE (r) == EXPR_LIST + && XEXP (r, 0) + && REG_P (XEXP (r, 0)) + && (GET_MODE (XEXP (r, 0)) == OImode + || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0))))) + return true; + } + } + + return false; } /* Implement the TARGET_OPTION_OVERRIDE hook. */ @@ -4626,7 +4968,14 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) return false; } else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) - ; + { + /* Disable sibcall if we need to generate vzeroupper after + callee returns. */ + if (TARGET_VZEROUPPER + && cfun->machine->callee_return_avx256_p + && !cfun->machine->caller_return_avx256_p) + return false; + } else if (!rtx_equal_p (a, b)) return false; @@ -5243,15 +5592,54 @@ void init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ tree fntype, /* tree ptr for function decl */ rtx libname, /* SYMBOL_REF of library name or 0 */ - tree fndecl) + tree fndecl, + int caller) { - struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL; + struct cgraph_local_info *i; + tree fnret_type; + memset (cum, 0, sizeof (*cum)); + /* Initialize for the current callee. */ + if (caller) + { + cfun->machine->callee_pass_avx256_p = false; + cfun->machine->callee_return_avx256_p = false; + } + if (fndecl) - cum->call_abi = ix86_function_abi (fndecl); + { + i = cgraph_local_info (fndecl); + cum->call_abi = ix86_function_abi (fndecl); + fnret_type = TREE_TYPE (TREE_TYPE (fndecl)); + } else - cum->call_abi = ix86_function_type_abi (fntype); + { + i = NULL; + cum->call_abi = ix86_function_type_abi (fntype); + if (fntype) + fnret_type = TREE_TYPE (fntype); + else + fnret_type = NULL; + } + + if (TARGET_VZEROUPPER && fnret_type) + { + rtx fnret_value = ix86_function_value (fnret_type, fntype, + false); + if (function_pass_avx256_p (fnret_value)) + { + /* The return value of this function uses 256bit AVX modes. */ + cfun->machine->use_avx256_p = true; + if (caller) + cfun->machine->callee_return_avx256_p = true; + else + cfun->machine->caller_return_avx256_p = true; + } + } + + cum->caller = caller; + /* Set up the number of registers to use for passing arguments. */ if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS) @@ -6488,6 +6876,7 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode, { enum machine_mode mode = omode; HOST_WIDE_INT bytes, words; + rtx arg; if (mode == BLKmode) bytes = int_size_in_bytes (type); @@ -6501,11 +6890,23 @@ ix86_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode, mode = type_natural_mode (type, cum); if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI) - return function_arg_ms_64 (cum, mode, omode, named, bytes); + arg = function_arg_ms_64 (cum, mode, omode, named, bytes); else if (TARGET_64BIT) - return function_arg_64 (cum, mode, omode, type, named); + arg = function_arg_64 (cum, mode, omode, type, named); else - return function_arg_32 (cum, mode, omode, type, bytes, words); + arg = function_arg_32 (cum, mode, omode, type, bytes, words); + + if (TARGET_VZEROUPPER && function_pass_avx256_p (arg)) + { + /* This argument uses 256bit AVX modes. */ + cfun->machine->use_avx256_p = true; + if (cum->caller) + cfun->machine->callee_pass_avx256_p = true; + else + cfun->machine->caller_pass_avx256_p = true; + } + + return arg; } /* A C expression that indicates when an argument must be passed by @@ -10353,6 +10754,15 @@ ix86_expand_epilogue (int style) return; } + /* Emit vzeroupper if needed. */ + if (TARGET_VZEROUPPER + && cfun->machine->use_avx256_p + && !cfun->machine->caller_return_avx256_p) + { + cfun->machine->use_vzeroupper_p = 1; + emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256))); + } + if (crtl->args.pops_args && crtl->args.size) { rtx popc = GEN_INT (crtl->args.pops_args); @@ -20910,6 +21320,25 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, + 2, vec)); } + /* Emit vzeroupper if needed. */ + if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p) + { + rtx avx256; + cfun->machine->use_vzeroupper_p = 1; + if (cfun->machine->callee_pass_avx256_p) + { + if (cfun->machine->callee_return_avx256_p) + avx256 = GEN_INT (callee_return_pass_avx256); + else + avx256 = GEN_INT (callee_pass_avx256); + } + else if (cfun->machine->callee_return_avx256_p) + avx256 = GEN_INT (callee_return_avx256); + else + avx256 = GEN_INT (call_no_avx256); + emit_insn (gen_avx_vzeroupper (avx256)); + } + call = emit_call_insn (call); if (use) CALL_INSN_FUNCTION_USAGE (call) = use; @@ -21653,6 +22082,9 @@ ix86_local_alignment (tree exp, enum machine_mode mode, decl = NULL; } + if (use_avx256_p (mode, type)) + cfun->machine->use_avx256_p = true; + /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ if (!TARGET_64BIT @@ -21748,9 +22180,6 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode, { tree type, decl; - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) - return align; - if (exp && DECL_P (exp)) { type = TREE_TYPE (exp); @@ -21762,6 +22191,12 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode, decl = NULL; } + if (use_avx256_p (mode, type)) + cfun->machine->use_avx256_p = true; + + if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + return align; + /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) @@ -25505,6 +25940,8 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, switch ((enum ix86_builtin_func_type) d->flag) { case VOID_FTYPE_VOID: + if (icode == CODE_FOR_avx_vzeroupper) + target = GEN_INT (vzeroupper_intrinsic); emit_insn (GEN_FCN (icode) (target)); return 0; case VOID_FTYPE_UINT64: @@ -28542,6 +28979,10 @@ ix86_reorg (void) ix86_avoid_jump_mispredicts (); #endif } + + /* Run the vzeroupper optimization if needed. */ + if (cfun->machine->use_vzeroupper_p) + move_or_delete_vzeroupper (); } /* Return nonzero when QImode register that must be represented via REX prefix diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 25463a5..5474048 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1507,6 +1507,7 @@ typedef struct ix86_args { int mmx_nregs; /* # mmx registers available for passing */ int mmx_regno; /* next available mmx register number */ int maybe_vaarg; /* true for calls to possibly vardic fncts. */ + int caller; /* true if it is caller. */ int float_in_sse; /* Set to 1 or 2 for 32bit targets if SFmode/DFmode arguments should be passed in SSE registers. Otherwise 0. */ @@ -1519,7 +1520,8 @@ typedef struct ix86_args { For a library call, FNTYPE is 0. */ #define INIT_CUMULATIVE_ARGS(CUM, FNTYPE, LIBNAME, FNDECL, N_NAMED_ARGS) \ - init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL)) + init_cumulative_args (&(CUM), (FNTYPE), (LIBNAME), (FNDECL), \ + (N_NAMED_ARGS) != -1) /* Output assembler code to FILE to increment profiler label # LABELNO for profiling a function entry. */ @@ -2289,6 +2291,24 @@ struct GTY(()) machine_function { stack below the return address. */ BOOL_BITFIELD static_chain_on_stack : 1; + /* Nonzero if the current function uses vzeroupper. */ + BOOL_BITFIELD use_vzeroupper_p : 1; + + /* Nonzero if the current function uses 256bit AVX regisers. */ + BOOL_BITFIELD use_avx256_p : 1; + + /* Nonzero if caller passes 256bit AVX modes. */ + BOOL_BITFIELD caller_pass_avx256_p : 1; + + /* Nonzero if caller returns 256bit AVX modes. */ + BOOL_BITFIELD caller_return_avx256_p : 1; + + /* Nonzero if the current callee passes 256bit AVX modes. */ + BOOL_BITFIELD callee_pass_avx256_p : 1; + + /* Nonzero if the current callee returns 256bit AVX modes. */ + BOOL_BITFIELD callee_return_avx256_p : 1; + /* During prologue/epilogue generation, the current frame state. Otherwise, the frame state at the end of the prologue. */ struct machine_frame_state fs; diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 9c1fe1f..28a921f 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -256,6 +256,11 @@ mcld Target Report Mask(CLD) Save Generate cld instruction in the function prologue. +mvzeroupper +Target Report Mask(VZEROUPPER) Save +Generate vzeroupper instruction before a transfer of control flow out of +the function. + mfused-madd Target Report Mask(FUSED_MADD) Save Enable automatic generation of fused floating point multiply-add instructions diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ffddf18..078fac6 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11508,29 +11508,11 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -;; vzeroupper clobbers the upper 128bits of AVX registers. -(define_expand "avx_vzeroupper" - [(match_par_dup 0 [(const_int 0)])] - "TARGET_AVX" -{ - int nregs = TARGET_64BIT ? 16 : 8; - int regno; - - operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1)); - - XVECEXP (operands[0], 0, 0) - = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx), - UNSPECV_VZEROUPPER); - - for (regno = 0; regno < nregs; regno++) - XVECEXP (operands[0], 0, regno + 1) - = gen_rtx_CLOBBER (VOIDmode, - gen_rtx_REG (V8SImode, SSE_REGNO (regno))); -}) - -(define_insn "*avx_vzeroupper" - [(match_parallel 0 "vzeroupper_operation" - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] +;; Clear the upper 128bits of AVX registers, equivalent to a NOP +;; if the upper 128bits are unused. +(define_insn "avx_vzeroupper" + [(unspec_volatile [(match_operand 0 "const_int_operand" "")] + UNSPECV_VZEROUPPER)] "TARGET_AVX" "vzeroupper" [(set_attr "type" "sse") diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 7ea042f..365b8c3 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -594,7 +594,7 @@ Objective-C and Objective-C++ Dialects}. -mno-wide-multiply -mrtd -malign-double @gol -mpreferred-stack-boundary=@var{num} -mincoming-stack-boundary=@var{num} @gol --mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip @gol +-mcld -mcx16 -msahf -mmovbe -mcrc32 -mrecip -mvzeroupper @gol -mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 -mavx @gol -maes -mpclmul -mfsgsbase -mrdrnd -mf16c -mfused-madd @gol -msse4a -m3dnow -mpopcnt -mabm -mfma4 -mxop -mlwp @gol @@ -12466,6 +12466,13 @@ GCC with the @option{--enable-cld} configure option. Generation of @code{cld} instructions can be suppressed with the @option{-mno-cld} compiler option in this case. +@item -mvzeroupper +@opindex mvzeroupper +This option instructs GCC to emit a @code{vzeroupper} instruction +before a transfer of control flow out of the function to minimize +AVX to SSE transition penalty as well as remove unnecessary zeroupper +intrinsics. + @item -mcx16 @opindex mcx16 This option will enable GCC to use CMPXCHG16B instruction in generated code. diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c index 2137c25..73ce795 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-1.c @@ -1,6 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target avx } */ -/* { dg-options "-O2 -mavx" } */ +/* { dg-options "-O2 -mavx -mtune=generic" } */ #include "avx-check.h" diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c new file mode 100644 index 0000000..5007753 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-10.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern float x, y; + +void +foo () +{ + x = y; + _mm256_zeroupper (); + _mm256_zeroupper (); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c new file mode 100644 index 0000000..507f945 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-11.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern float x, y; + +void +foo () +{ + x = y; + _mm256_zeroall (); + _mm256_zeroupper (); + _mm256_zeroupper (); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ +/* { dg-final { scan-assembler-times "avx_vzeroupper" 3 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c new file mode 100644 index 0000000..f74ea0c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-12.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + _mm256_zeroall (); + _mm256_zeroupper (); + x = y; + _mm256_zeroupper (); + _mm256_zeroupper (); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ +/* { dg-final { scan-assembler-times "\\*avx_vzeroall" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c new file mode 100644 index 0000000..cff5f88 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-13.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mno-vzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + x = y; +} + +/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c new file mode 100644 index 0000000..e74bc24 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mtune=generic -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + x = y; +} + +/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c index 9771e6c..66df90f 100644 --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-2.c @@ -1,6 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target avx } */ -/* { dg-options "-O2 -mavx" } */ +/* { dg-options "-O2 -mavx -mtune=generic" } */ #include "avx-check.h" diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c new file mode 100644 index 0000000..8053d78 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-3.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx } */ +/* { dg-options "-O2 -mavx -mvzeroupper" } */ + +#include "avx-check.h" + +int s[8] = {1, 2, 3, 4, 5, 6, 7, 8}; +int d[8] = {11, 22, 33, 44, 55, 66, 77, 88}; + +void +__attribute__((noinline)) +foo () +{ + int i; + for (i = 0; i < ARRAY_SIZE (d); i++) + d[i] = s[i] + 0x1000; +} + +static void +__attribute__((noinline)) +bar (__m256i src) +{ + foo (); + _mm256_storeu_si256 ((__m256i*) d, src); + if (__builtin_memcmp (d, s, sizeof (d))) + abort (); +} + +static void +avx_test (void) +{ + __m256i src = _mm256_loadu_si256 ((__m256i*) s); + bar (src); +} diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c new file mode 100644 index 0000000..c55c814 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-4.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__)); + +extern void bar2 (__m256); +extern __m256 y; + +void +foo () +{ + bar2 (y); +} + +/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c new file mode 100644 index 0000000..a14460c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-5.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern void bar2 (__m256); +extern __m256 y; + +void +foo () +{ + bar2 (y); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c new file mode 100644 index 0000000..ada87bd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-6.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + x = y; + _mm256_zeroall (); +} + +/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c new file mode 100644 index 0000000..ab6d687 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-7.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + x = y; + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c new file mode 100644 index 0000000..0a821c2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-8.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + x = y; + _mm256_zeroall (); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-not "avx_vzeroupper" } } */ diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c new file mode 100644 index 0000000..5aa05b8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-9.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -mavx -mvzeroupper -dp" } */ + +#include <immintrin.h> + +extern __m256 x, y; + +void +foo () +{ + _mm256_zeroupper (); + x = y; + _mm256_zeroupper (); + _mm256_zeroupper (); + _mm256_zeroupper (); +} + +/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */

PATCH: Add vzeroupper optimization for AVX

Commit Message

Comments

Patch