Message ID | 20240704012809.2385444-1-hongtao.liu@intel.com |
---|---|
State | New |
Headers | show |
Series | [V2] x86: Update branch hint for Redwood Cove. | expand |
On Thu, Jul 4, 2024 at 9:30 AM liuhongt <hongtao.liu@intel.com> wrote: > > From: "H.J. Lu" <hjl.tools@gmail.com> > > >The above reads like it would be worth splitting branch_prediction_hints > >into branch_prediction_hints_taken and branch_prediction_hints_not_taken > >given not-taken is the default and thus will just increase code size? > >According to Intel® 64 and IA-32 Architectures Optimization Reference > >Manual[1], Branch Hint is updated for Redwood Cove. > Changed. > > --------cut from [1]------------------------- > Starting with the Redwood Cove microarchitecture, if the predictor has > no stored information about a branch, the branch has the Intel® SSE2 > branch taken hint (i.e., instruction prefix 3EH), When the codec > decodes the branch, it flips the branch’s prediction from not-taken to > taken. It then flushes the pipeline in front of it and steers this > pipeline to fetch the taken path of the branch. > --------cut end ----------------------------- > > Split tune branch_prediction_hints into branch_prediction_hints_taken > and branch_prediction_hints_not_taken, always generate branch hint for > conditional branches, both tunes are disabled by default. > > [1] https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? Committed. > > gcc/ > > * config/i386/i386.cc (ix86_print_operand): Always generate > branch hint for conditional branches. > * config/i386/i386.h (TARGET_BRANCH_PREDICTION_HINTS): Split > into .. > (TARGET_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and .. > (TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this. > * config/i386/x86-tune.def (X86_TUNE_BRANCH_PREDICTION_HINTS): > Split into .. > (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and .. > (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this. 
> --- > gcc/config/i386/i386.cc | 29 +++++++++-------------------- > gcc/config/i386/i386.h | 6 ++++-- > gcc/config/i386/x86-tune.def | 13 +++++++++++-- > 3 files changed, 24 insertions(+), 24 deletions(-) > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 1f71ed04be6..ea9cb620f8d 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -14041,7 +14041,8 @@ ix86_print_operand (FILE *file, rtx x, int code) > > if (!optimize > || optimize_function_for_size_p (cfun) > - || !TARGET_BRANCH_PREDICTION_HINTS) > + || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN > + && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN)) > return; > > x = find_reg_note (current_output_insn, REG_BR_PROB, 0); > @@ -14050,25 +14051,13 @@ ix86_print_operand (FILE *file, rtx x, int code) > int pred_val = profile_probability::from_reg_br_prob_note > (XINT (x, 0)).to_reg_br_prob_base (); > > - if (pred_val < REG_BR_PROB_BASE * 45 / 100 > - || pred_val > REG_BR_PROB_BASE * 55 / 100) > - { > - bool taken = pred_val > REG_BR_PROB_BASE / 2; > - bool cputaken > - = final_forward_branch_p (current_output_insn) == 0; > - > - /* Emit hints only in the case default branch prediction > - heuristics would fail. */ > - if (taken != cputaken) > - { > - /* We use 3e (DS) prefix for taken branches and > - 2e (CS) prefix for not taken branches. */ > - if (taken) > - fputs ("ds ; ", file); > - else > - fputs ("cs ; ", file); > - } > - } > + bool taken = pred_val > REG_BR_PROB_BASE / 2; > + /* We use 3e (DS) prefix for taken branches and > + 2e (CS) prefix for not taken branches. 
*/ > + if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN) > + fputs ("ds ; ", file); > + else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN) > + fputs ("cs ; ", file); > } > return; > } > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 9ed225ec587..50ebed221dc 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -309,8 +309,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; > #define TARGET_ZERO_EXTEND_WITH_AND \ > ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND] > #define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN] > -#define TARGET_BRANCH_PREDICTION_HINTS \ > - ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS] > +#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \ > + ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN] > +#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \ > + ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN] > #define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD] > #define TARGET_USE_SAHF ix86_tune_features[X86_TUNE_USE_SAHF] > #define TARGET_MOVX ix86_tune_features[X86_TUNE_MOVX] > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > index 343c32c291f..3d29bffc49c 100644 > --- a/gcc/config/i386/x86-tune.def > +++ b/gcc/config/i386/x86-tune.def > @@ -683,15 +683,24 @@ DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6) > DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", > m_K8) > > +/* X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, starting with the Redwood Cove > + microarchitecture, if the predictor has no stored information about a branch, > + the branch has the Intel® SSE2 branch taken hint > + (i.e., instruction prefix 3EH), When the codec decodes the branch, it flips > + the branch’s prediction from not-taken to taken. It then flushes the pipeline > + in front of it and steers this pipeline to fetch the taken path of the > + branch. 
*/ > +DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, "branch_prediction_hints_taken", m_NONE) > + > /*****************************************************************************/ > /* This never worked well before. */ > /*****************************************************************************/ > > -/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based > +/* X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN: Branch hints were put in P4 based > on simulation result. But after P4 was made, no performance benefit > was observed with branch hints. It also increases the code size. > As a result, icc never generates branch hints. */ > -DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE) > +DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN, "branch_prediction_hints_not_taken", m_NONE) > > /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ > DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL) > -- > 2.31.1 >
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 1f71ed04be6..ea9cb620f8d 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -14041,7 +14041,8 @@ ix86_print_operand (FILE *file, rtx x, int code) if (!optimize || optimize_function_for_size_p (cfun) - || !TARGET_BRANCH_PREDICTION_HINTS) + || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN + && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN)) return; x = find_reg_note (current_output_insn, REG_BR_PROB, 0); @@ -14050,25 +14051,13 @@ ix86_print_operand (FILE *file, rtx x, int code) int pred_val = profile_probability::from_reg_br_prob_note (XINT (x, 0)).to_reg_br_prob_base (); - if (pred_val < REG_BR_PROB_BASE * 45 / 100 - || pred_val > REG_BR_PROB_BASE * 55 / 100) - { - bool taken = pred_val > REG_BR_PROB_BASE / 2; - bool cputaken - = final_forward_branch_p (current_output_insn) == 0; - - /* Emit hints only in the case default branch prediction - heuristics would fail. */ - if (taken != cputaken) - { - /* We use 3e (DS) prefix for taken branches and - 2e (CS) prefix for not taken branches. */ - if (taken) - fputs ("ds ; ", file); - else - fputs ("cs ; ", file); - } - } + bool taken = pred_val > REG_BR_PROB_BASE / 2; + /* We use 3e (DS) prefix for taken branches and + 2e (CS) prefix for not taken branches. 
*/ + if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN) + fputs ("ds ; ", file); + else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN) + fputs ("cs ; ", file); } return; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 9ed225ec587..50ebed221dc 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -309,8 +309,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; #define TARGET_ZERO_EXTEND_WITH_AND \ ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND] #define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN] -#define TARGET_BRANCH_PREDICTION_HINTS \ - ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS] +#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \ + ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN] +#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \ + ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN] #define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD] #define TARGET_USE_SAHF ix86_tune_features[X86_TUNE_USE_SAHF] #define TARGET_MOVX ix86_tune_features[X86_TUNE_MOVX] diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 343c32c291f..3d29bffc49c 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -683,15 +683,24 @@ DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6) DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", m_K8) +/* X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, starting with the Redwood Cove + microarchitecture, if the predictor has no stored information about a branch, + the branch has the Intel® SSE2 branch taken hint + (i.e., instruction prefix 3EH), When the codec decodes the branch, it flips + the branch’s prediction from not-taken to taken. It then flushes the pipeline + in front of it and steers this pipeline to fetch the taken path of the + branch. 
*/ +DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN, "branch_prediction_hints_taken", m_NONE) + /*****************************************************************************/ /* This never worked well before. */ /*****************************************************************************/ -/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based +/* X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN: Branch hints were put in P4 based on simulation result. But after P4 was made, no performance benefit was observed with branch hints. It also increases the code size. As a result, icc never generates branch hints. */ -DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", m_NONE) +DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN, "branch_prediction_hints_not_taken", m_NONE) /* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", m_ALL)