@@ -141,6 +141,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
def_or_undef (parse_in, "__corei7");
def_or_undef (parse_in, "__corei7__");
break;
+ case PROCESSOR_COREI7_AVX:
+ def_or_undef (parse_in, "__corei7_avx");
+ def_or_undef (parse_in, "__corei7_avx__");
+ break;
case PROCESSOR_HASWELL:
def_or_undef (parse_in, "__core_avx2");
def_or_undef (parse_in, "__core_avx2__");
@@ -239,6 +243,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
case PROCESSOR_COREI7:
def_or_undef (parse_in, "__tune_corei7__");
break;
+ case PROCESSOR_COREI7_AVX:
+ def_or_undef (parse_in, "__tune_corei7_avx__");
+ break;
case PROCESSOR_HASWELL:
def_or_undef (parse_in, "__tune_core_avx2__");
break;
@@ -1908,8 +1908,9 @@ const struct processor_costs *ix86_cost = &pentium_cost;
#define m_P4_NOCONA (m_PENT4 | m_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)
#define m_COREI7 (1<<PROCESSOR_COREI7)
+#define m_COREI7_AVX (1<<PROCESSOR_COREI7_AVX)
#define m_HASWELL (1<<PROCESSOR_HASWELL)
-#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
+#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_COREI7_AVX | m_HASWELL)
#define m_ATOM (1<<PROCESSOR_ATOM)
#define m_SLM (1<<PROCESSOR_SLM)
@@ -1984,10 +1985,10 @@ static const unsigned int x86_arch_always_fancy_math_387
= m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM |
m_AMD_MULTIPLE | m_GENERIC;
static const unsigned int x86_avx256_split_unaligned_load
- = m_COREI7 | m_GENERIC;
+ = m_COREI7 | m_COREI7_AVX | m_GENERIC;
static const unsigned int x86_avx256_split_unaligned_store
- = m_COREI7 | m_BDVER | m_GENERIC;
+ = m_COREI7 | m_COREI7_AVX | m_BDVER | m_GENERIC;
/* In case the average insn count for single function invocation is
lower than this constant, emit fast (but longer) prologue and
@@ -2377,6 +2378,8 @@ static const struct ptt
processor_target_table[PROCESSOR_max] =
{&core_cost, 16, 10, 16, 10, 16},
/* Core i7 */
{&core_cost, 16, 10, 16, 10, 16},
+ /* Core i7 avx */
+ {&core_cost, 16, 10, 16, 10, 16},
/* Core avx2 */
{&core_cost, 16, 10, 16, 10, 16},
{&generic32_cost, 16, 7, 16, 7, 16},
@@ -2407,6 +2410,7 @@ static const char *const
cpu_names[TARGET_CPU_DEFAULT_max] =
"nocona",
"core2",
"corei7",
+ "corei7-avx",
"core-avx2",
"atom",
"slm",
@@ -3091,12 +3095,12 @@ ix86_option_override_internal (bool main_args_p)
{"corei7", PROCESSOR_COREI7, CPU_COREI7,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
| PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
- {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
+ {"corei7-avx", PROCESSOR_COREI7_AVX, CPU_COREI7,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
| PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
- {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
+ {"core-avx-i", PROCESSOR_COREI7_AVX, CPU_COREI7,
PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
| PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
| PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
@@ -24477,6 +24481,7 @@ ix86_issue_rate (void)
case PROCESSOR_PENTIUM4:
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_COREI7_AVX:
case PROCESSOR_HASWELL:
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
@@ -24834,6 +24839,7 @@ ia32_multipass_dfa_lookahead (void)
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_COREI7_AVX:
case PROCESSOR_HASWELL:
case PROCESSOR_ATOM:
case PROCESSOR_SLM:
@@ -25474,6 +25480,7 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
{
case PROCESSOR_CORE2:
case PROCESSOR_COREI7:
+ case PROCESSOR_COREI7_AVX:
case PROCESSOR_HASWELL:
/* Do not perform multipass scheduling for pre-reload schedule
to save compile time. */
@@ -29324,6 +29331,10 @@ get_builtin_code_for_version (tree decl, tree
*predicate_list)
arg_str = "corei7";
priority = P_PROC_SSE4_2;
break;
+ case PROCESSOR_COREI7_AVX:
+ arg_str = "corei7-avx";
+ priority = P_PROC_SSE4_2;
+ break;
case PROCESSOR_ATOM:
arg_str = "atom";
priority = P_PROC_SSSE3;
@@ -250,6 +250,7 @@ extern const struct processor_costs ix86_size_cost;
#define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA)
#define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2)
#define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7)
+#define TARGET_COREI7_AVX (ix86_tune == PROCESSOR_COREI7_AVX)
#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL)
#define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32)
#define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64)
@@ -559,6 +560,7 @@ enum target_cpu_default
TARGET_CPU_DEFAULT_nocona,
TARGET_CPU_DEFAULT_core2,
TARGET_CPU_DEFAULT_corei7,
+ TARGET_CPU_DEFAULT_corei7_avx,
TARGET_CPU_DEFAULT_haswell,
TARGET_CPU_DEFAULT_atom,
TARGET_CPU_DEFAULT_slm,
@@ -2118,6 +2120,7 @@ enum processor_type
PROCESSOR_NOCONA,
PROCESSOR_CORE2,
PROCESSOR_COREI7,
+ PROCESSOR_COREI7_AVX,
PROCESSOR_HASWELL,
PROCESSOR_GENERIC32,
PROCESSOR_GENERIC64,
@@ -118,9 +118,9 @@ DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY,
"sse_partial_reg_dependency",
m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_SLM | m_AMDFAM10
| m_BDVER | m_GENERIC)
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal",
- m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM)
+ m_COREI7 | m_COREI7_AVX | m_AMDFAM10 | m_BDVER | m_BTVER | m_SLM)
DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal",
- m_COREI7 | m_BDVER | m_SLM)
+ m_COREI7 | m_COREI7_AVX | m_BDVER | m_SLM)
DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL,
"sse_packed_single_insn_optimal",
m_BDVER)
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
Patch2:
2013-09-16 Wei Mi <wmi@google.com>
* gcc/config/i386/i386.c (ix86_macro_fusion_p): New Function.
(ix86_macro_fusion_pair_p): Ditto.
* gcc/config/i386/i386.h: Add new tune features about macro-fusion.
* gcc/config/i386/x86-tune.def (DEF_TUNE): Ditto.
* gcc/doc/tm.texi: Generated.
* gcc/doc/tm.texi.in: Ditto.
* gcc/haifa-sched.c (try_group_insn): New function.
(group_insns_for_macro_fusion): Ditto.
(sched_init): Call group_insns_for_macro_fusion.
* gcc/sched-rgn.c (add_branch_dependences): Keep insns in
a SCHED_GROUP at the end of BB to remain their location.
* gcc/target.def: Add two hooks: macro_fusion_p and
macro_fusion_pair_p.
@@ -24856,6 +24856,90 @@ ia32_multipass_dfa_lookahead (void)
}
}
+/* Return true if target platform supports macro-fusion. */
+
+static bool
+ix86_macro_fusion_p ()
+{
+ if (TARGET_FUSE_CMP_AND_BRANCH
+ && (!TARGET_64BIT || TARGET_FUSE_CMP_AND_BRANCH_64))
+ return true;
+ else
+ return false;
+}
+
+/* Check whether current microarchitecture support macro fusion
+ for insn pair "CONDGEN + CONDJMP". Refer to
+ "Intel Architectures Optimization Reference Manual". */
+
+static bool
+ix86_macro_fusion_pair_p (rtx condgen, rtx condjmp)
+{
+ rtx src;
+ rtx single_set = single_set (condgen);
+ enum rtx_code ccode;
+ rtx compare_set = NULL_RTX, test_if, cond;
+
+ if (single_set == NULL_RTX
+ && !TARGET_FUSE_ALU_AND_BRANCH)
+ return false;
+
+ if (single_set != NULL_RTX)
+ compare_set = single_set;
+ else
+ {
+ int i;
+ rtx pat = PATTERN (condgen);
+ for (i = 0; i < XVECLEN (pat, 0); i++)
+ if (GET_CODE (XVECEXP (pat, 0, i)) == SET
+ && GET_CODE (SET_SRC (XVECEXP (pat, 0, i))) == COMPARE)
+ compare_set = XVECEXP (pat, 0, i);
+ }
+ if (compare_set == NULL_RTX)
+ return false;
+ src = SET_SRC (compare_set);
+ if (GET_CODE (src) != COMPARE)
+ return false;
+
+ /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
+ supported. */
+ if ((MEM_P (XEXP (src, 0))
+ && CONST_INT_P (XEXP (src, 1)))
+ || (MEM_P (XEXP (src, 1))
+ && CONST_INT_P (XEXP (src, 0))))
+ return false;
+
+ test_if = SET_SRC (pc_set (condjmp));
+ cond = XEXP (test_if, 0);
+ ccode = GET_CODE (cond);
+ /* Check whether conditional jump use Sign or Overflow Flags. */
+ if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
+ && (ccode == GE
+ || ccode == GT
+ || ccode == LE
+ || ccode == LT))
+ return false;
+
+ if (get_attr_type (condgen) == TYPE_TEST
+ || get_attr_type (condgen) == TYPE_ICMP)
+ return true;
+
+ /* The following is the case that macro-fusion for alu + jmp. */
+ if (!TARGET_FUSE_ALU_AND_BRANCH)
+ return false;
+
+ /* Macro-fusion for inc/dec + unsigned conditional jump is not
+ supported. */
+ if (get_attr_type (condgen) == TYPE_INCDEC
+ && (ccode == GEU
+ || ccode == GTU
+ || ccode == LEU
+ || ccode == LTU))
+ return false;
+
+ return true;
+}
+
/* Try to reorder ready list to take advantage of Atom pipelined IMUL
execution. It is applied if
(1) IMUL instruction is on the top of list;
@@ -42993,6 +43077,10 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
ia32_multipass_dfa_lookahead
+#undef TARGET_SCHED_MACRO_FUSION_P
+#define TARGET_SCHED_MACRO_FUSION_P ix86_macro_fusion_p
+#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
+#define TARGET_SCHED_MACRO_FUSION_PAIR_P ix86_macro_fusion_pair_p
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
@@ -364,6 +364,12 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_USE_VECTOR_CONVERTS]
#define TARGET_FUSE_CMP_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH]
+#define TARGET_FUSE_CMP_AND_BRANCH_64 \
+ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_64]
+#define TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS \
+ ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
+#define TARGET_FUSE_ALU_AND_BRANCH \
+ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
#define TARGET_VECTORIZE_DOUBLE \
ix86_tune_features[X86_TUNE_VECTORIZE_DOUBLE]
@@ -196,7 +196,22 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS,
"use_vector_converts", m_AMDFAM10)
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
with a subsequent conditional jump instruction into a single
compare-and-branch uop. */
-DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch", m_BDVER)
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH, "fuse_cmp_and_branch",
+ m_CORE_ALL | m_BDVER)
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent
+ conditional jump instruction for TARGET_64BIT. */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64",
+ m_COREI7 | m_COREI7_AVX | m_HASWELL)
+/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a
+ subsequent conditional jump instruction when the condition jump
+ check sign flag (SF) or overflow flag (OF). */
+DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",
+ m_COREI7 | m_COREI7_AVX | m_HASWELL)
+/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
+ jump instruction when the alu instruction produces the CCFLAG consumed by
+ the conditional jump instruction. */
+DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
+ m_COREI7_AVX | m_HASWELL)
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_ATOM | m_SLM)
@@ -6553,6 +6553,17 @@ scheduling one insn causes other insns to
become ready in the same
cycle. These other insns can then be taken into account properly.
@end deftypefn
+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_P (void)
+This hook is used to check whether target platform supports macro fusion.
+@end deftypefn
+
+@deftypefn {Target Hook} bool TARGET_SCHED_MACRO_FUSION_PAIR_P (rtx
@var{condgen}, rtx @var{condjmp})
+This hook is used to check whether two insns could be macro fused for
+target microarchitecture. If this hook returns true for the given insn pair
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched
+group, and they will not be scheduled apart.
+@end deftypefn
+
@deftypefn {Target Hook} void
TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK (rtx @var{head}, rtx
@var{tail})
This hook is called after evaluation forward dependencies of insns in
chain given by two parameter values (@var{head} and @var{tail}
@@ -4940,6 +4940,10 @@ them: try the first ones in this list first.
@hook TARGET_SCHED_REORDER2
+@hook TARGET_SCHED_MACRO_FUSION_P
+
+@hook TARGET_SCHED_MACRO_FUSION_PAIR_P
+
@hook TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
@hook TARGET_SCHED_INIT
@@ -6519,6 +6519,44 @@ setup_sched_dump (void)
? stderr : dump_file);
}
+static void
+try_group_insn (rtx insn)
+{
+ unsigned int condreg1, condreg2;
+ rtx cc_reg_1;
+ rtx prev;
+
+ targetm.fixed_condition_code_regs (&condreg1, &condreg2);
+ cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
+ prev = prev_nonnote_nondebug_insn (insn);
+ if (!any_condjump_p (insn)
+ || !reg_referenced_p (cc_reg_1, PATTERN (insn))
+ || !prev
+ || !modified_in_p (cc_reg_1, prev))
+ return;
+
+ /* Different microarchitectures support macro fusions for different
+ combinations of insn pairs. */
+ if (!targetm.sched.macro_fusion_pair_p
+ || !targetm.sched.macro_fusion_pair_p (prev, insn))
+ return;
+
+ SCHED_GROUP_P (insn) = 1;
+}
+
+/* If the last cond jump and the cond register defining insn are consecutive
+ before scheduling, we want them to be in a schedule group. This is good
+ for performance on microarchitectures supporting macro-fusion. */
+
+static void
+group_insns_for_macro_fusion ()
+{
+ basic_block bb;
+
+ FOR_EACH_BB (bb)
+ try_group_insn (BB_END (bb));
+}
+
/* Initialize some global state for the scheduler. This function works
with the common data shared between all the schedulers. It is called
from the scheduler specific initialization routine. */
@@ -6645,6 +6683,11 @@ sched_init (void)
}
curr_state = xmalloc (dfa_state_size);
+
+ /* Group compare and branch insns for macro-fusion. */
+ if (targetm.sched.macro_fusion_p
+ && targetm.sched.macro_fusion_p ())
+ group_insns_for_macro_fusion ();
}
static void haifa_init_only_bb (basic_block, basic_block);
@@ -2443,6 +2443,8 @@ add_branch_dependences (rtx head, rtx tail)
cc0 setters remain at the end because they can't be moved away from
their cc0 user.
+ Predecessors of SCHED_GROUP_P instructions at the end remain at the end.
+
COND_EXEC insns cannot be moved past a branch (see e.g. PR17808).
Insns setting TARGET_CLASS_LIKELY_SPILLED_P registers (usually return
@@ -2465,7 +2467,8 @@ add_branch_dependences (rtx head, rtx tail)
#endif
|| (!reload_completed
&& sets_likely_spilled (PATTERN (insn)))))
- || NOTE_P (insn))
+ || NOTE_P (insn)
+ || (last != 0 && SCHED_GROUP_P (last)))
{
if (!NOTE_P (insn))
{
@@ -1041,6 +1041,19 @@ scheduling one insn causes other insns to
become ready in the same\n\
cycle. These other insns can then be taken into account properly.",
int, (FILE *file, int verbose, rtx *ready, int *n_readyp, int clock), NULL)
+DEFHOOK
+(macro_fusion_p,
+ "This hook is used to check whether target platform supports macro fusion.",
+ bool, (void), NULL)
+
+DEFHOOK
+(macro_fusion_pair_p,
+ "This hook is used to check whether two insns could be macro fused for\n\
+target microarchitecture. If this hook returns true for the given insn pair\n\
+(@var{condgen} and @var{condjmp}), scheduler will put them into a sched\n\
+group, and they will not be scheduled apart.",
+ bool, (rtx condgen, rtx condjmp), NULL)
+
/* The following member value is a pointer to a function called
after evaluation forward dependencies of insns in chain given