Message ID | 20120330141858.420D5615CC@tjsboxrox.mtv.corp.google.com |
---|---|
State | New |
Headers | show |
I should add that I have tested performance of this on Core2, Corei7 (Nehalem) and AMD Opteron-based systems. It appears to be performance-neutral on AMD (only minor perturbations, overall a wash). For the test case that provoked the optimization, there were nice improvements on Core2 and Corei7. Thanks, Teresa On Fri, Mar 30, 2012 at 7:18 AM, Teresa Johnson <tejohnson@google.com> wrote: > This patch addresses instructions that incur expensive length-changing prefix (LCP) stalls > on some x86-64 implementations, notably Core2 and Corei7. Specifically, a move of > a 16-bit constant into memory requires a length-changing prefix and can incur significant > penalties. The attached patch avoids this by forcing such instructions to be split into > two: a move of the corresponding 32-bit constant into a register, and a move of the > register's lower 16 bits into memory. > > Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk? > > Thanks, > Teresa > > 2012-03-29 Teresa Johnson <tejohnson@google.com> > > * config/i386/i386.h (ix86_tune_indices): Add > X86_TUNE_LCP_STALL. > * config/i386/i386.md (movhi_internal): Split to > movhi_internal and movhi_imm_internal. > * config/i386/i386.c (initial_ix86_tune_features): Initialize > X86_TUNE_LCP_STALL entry. > > Index: config/i386/i386.h > =================================================================== > --- config/i386/i386.h (revision 185920) > +++ config/i386/i386.h (working copy) > @@ -262,6 +262,7 @@ enum ix86_tune_indices { > X86_TUNE_MOVX, > X86_TUNE_PARTIAL_REG_STALL, > X86_TUNE_PARTIAL_FLAG_REG_STALL, > + X86_TUNE_LCP_STALL, > X86_TUNE_USE_HIMODE_FIOP, > X86_TUNE_USE_SIMODE_FIOP, > X86_TUNE_USE_MOV0, > @@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L > #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL] > #define TARGET_PARTIAL_FLAG_REG_STALL \ > ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL] > +#define TARGET_LCP_STALL \ > + ix86_tune_features[X86_TUNE_LCP_STALL] > #define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP] > #define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP] > #define TARGET_USE_MOV0 ix86_tune_features[X86_TUNE_USE_MOV0] > Index: config/i386/i386.md > =================================================================== > --- config/i386/i386.md (revision 185920) > +++ config/i386/i386.md (working copy) > @@ -2262,9 +2262,19 @@ > ] > (const_string "SI")))]) > > +(define_insn "*movhi_imm_internal" > + [(set (match_operand:HI 0 "memory_operand" "=m") > + (match_operand:HI 1 "immediate_operand" "n"))] > + "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))" > +{ > + return "mov{w}\t{%1, %0|%0, %1}"; > +} > + [(set (attr "type") (const_string "imov")) > + (set (attr "mode") (const_string "HI"))]) > + > (define_insn "*movhi_internal" > [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m") > - (match_operand:HI 1 "general_operand" "r,rn,rm,rn"))] > + (match_operand:HI 1 "general_operand" "r,rn,rm,r"))] > "!(MEM_P (operands[0]) && MEM_P (operands[1]))" > { > switch (get_attr_type (insn)) > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 185920) > +++ config/i386/i386.c (working copy) > @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86 > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ > m_CORE2I7 | m_GENERIC, > > + /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > + * on 16-bit immediate moves into memory on Core2 and Corei7, > + * which may also affect AMD implementations. */ > + m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE, > + > /* X86_TUNE_USE_HIMODE_FIOP */ > m_386 | m_486 | m_K6_GEODE, > > > -- > This patch is available for review at http://codereview.appspot.com/5975045
> Index: config/i386/i386.md > =================================================================== > --- config/i386/i386.md (revision 185920) > +++ config/i386/i386.md (working copy) > @@ -2262,9 +2262,19 @@ > ] > (const_string "SI")))]) > > +(define_insn "*movhi_imm_internal" > + [(set (match_operand:HI 0 "memory_operand" "=m") > + (match_operand:HI 1 "immediate_operand" "n"))] > + "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))" > +{ > + return "mov{w}\t{%1, %0|%0, %1}"; > +} > + [(set (attr "type") (const_string "imov")) > + (set (attr "mode") (const_string "HI"))]) > + > (define_insn "*movhi_internal" > [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m") > - (match_operand:HI 1 "general_operand" "r,rn,rm,rn"))] > + (match_operand:HI 1 "general_operand" "r,rn,rm,r"))] If you do this, you will prevent reload from considering using immediate as rematerializatoin when the register holding constant is on a stack on !TARGET_LCP_STALL machines. The matching pattern for moves should really handle all available alternatives, so reload is happy. You can duplicate the pattern, but I think this is much better to be done as post-reload peephole2. I.e. ask for scratch register and if it is available do the splitting. This way optimization won't happen when there is no register available and we will also rely on post-reload cleanups to unify moves of constant, but I think this should work well. You also want to conditionalize the split by optimize_insn_for_speed, too. > "!(MEM_P (operands[0]) && MEM_P (operands[1]))" > { > switch (get_attr_type (insn)) > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 185920) > +++ config/i386/i386.c (working copy) > @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86 > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ > m_CORE2I7 | m_GENERIC, > > + /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > + * on 16-bit immediate moves into memory on Core2 and Corei7, > + * which may also affect AMD implementations. */ > + m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE, Is this supposed to help AMD? (at least the pre-buldozer design should not care about length changing prefixes that much because it tags sizes in the cache). If not, I would suggest enabling it only for cores and generic. Honza
Index: config/i386/i386.h =================================================================== --- config/i386/i386.h (revision 185920) +++ config/i386/i386.h (working copy) @@ -262,6 +262,7 @@ enum ix86_tune_indices { X86_TUNE_MOVX, X86_TUNE_PARTIAL_REG_STALL, X86_TUNE_PARTIAL_FLAG_REG_STALL, + X86_TUNE_LCP_STALL, X86_TUNE_USE_HIMODE_FIOP, X86_TUNE_USE_SIMODE_FIOP, X86_TUNE_USE_MOV0, @@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL] #define TARGET_PARTIAL_FLAG_REG_STALL \ ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL] +#define TARGET_LCP_STALL \ + ix86_tune_features[X86_TUNE_LCP_STALL] #define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP] #define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP] #define TARGET_USE_MOV0 ix86_tune_features[X86_TUNE_USE_MOV0] Index: config/i386/i386.md =================================================================== --- config/i386/i386.md (revision 185920) +++ config/i386/i386.md (working copy) @@ -2262,9 +2262,19 @@ ] (const_string "SI")))]) +(define_insn "*movhi_imm_internal" + [(set (match_operand:HI 0 "memory_operand" "=m") + (match_operand:HI 1 "immediate_operand" "n"))] + "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + return "mov{w}\t{%1, %0|%0, %1}"; +} + [(set (attr "type") (const_string "imov")) + (set (attr "mode") (const_string "HI"))]) + (define_insn "*movhi_internal" [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m") - (match_operand:HI 1 "general_operand" "r,rn,rm,rn"))] + (match_operand:HI 1 "general_operand" "r,rn,rm,r"))] "!(MEM_P (operands[0]) && MEM_P (operands[1]))" { switch (get_attr_type (insn)) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 185920) +++ config/i386/i386.c (working copy) @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ m_CORE2I7 | m_GENERIC, + /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall + * on 16-bit immediate moves into memory on Core2 and Corei7, + * which may also affect AMD implementations. */ + m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE, + /* X86_TUNE_USE_HIMODE_FIOP */ m_386 | m_486 | m_K6_GEODE,