Message ID | 20121212163722.GA21037@atrey.karlin.mff.cuni.cz |
---|---|
State | New |
On Wed, Dec 12, 2012 at 8:37 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> I noticed in prologue/epilogue, GCC prefers to use MOVs followed by a >> SP adjustment instead of a sequence of pushes/pops. The preference to >> the MOVs are good for old CPU micro-architectures (before pentium-4, >> K10), because it breaks the data dependency. In modern >> micro-architecture, push/pop is implemented using a mechanism called >> stack engine. The data dependency is removed by the hardware, and >> push/pop becomes very cheap (1 uOp, 1 cycle latency), and they are >> smaller. There is no longer the need to avoid using them. This is >> also what ICC does. >> >> The following patch fixed the problem. It passes bootstrap/regression >> test. OK to install? >> >> thanks, >> >> David >> >> Index: config/i386/i386.c >> =================================================================== >> --- config/i386/i386.c (revision 194324) >> +++ config/i386/i386.c (working copy) >> @@ -1919,10 +1919,10 @@ static unsigned int initial_ix86_tune_fe >> m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> >> /* X86_TUNE_PROLOGUE_USING_MOVE */ >> - m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC, >> + m_PPRO | m_ATHLON_K8, >> >> /* X86_TUNE_EPILOGUE_USING_MOVE */ >> - m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC, >> + m_PPRO | m_ATHLON_K8, > > Push/pops wrt moves was always difficult to tune on old CPUs, so I am happy it > is gone from generic (in fact I had similar patch pending). > Are you sure about Atom having stack engine, too? > Good question. The instruction latency table (http://www.agner.org/optimize/instruction_tables.pdf) shows that for Atom: push r has one 1uop, 1 cycle latency. However the instruction is not pairable which will affect ILP. The guide here http://www.agner.org/optimize/microarchitecture.pdf does not mention Atom has stack engine either. I will help collect some performance data on Atom. thanks, David > Related thing is accumulate_outgoing_args. Igor is testing it on Core and I will > give it a try on K10. > > Honza > > I am attaching the changes for core costs I made if someone is interested in > testing them. If we can declare P4/PPRo and maybe K8 chips obsolette for > generic, there is room for improvement in generic, too. Like using inc/dec > again. > > Honza > > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 194452) > +++ config/i386/i386.c (working copy) > @@ -1620,14 +1620,14 @@ struct processor_costs core_cost = { > COSTS_N_INSNS (8), /* cost of FABS instruction. */ > COSTS_N_INSNS (8), /* cost of FCHS instruction. */ > COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ > - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, > + {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > {-1, libcall, false}}}}, > {{libcall, {{6, loop_1_byte, true}, > {24, loop, true}, > {8192, rep_prefix_4_byte, true}, > {-1, libcall, false}}}, > - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > {-1, libcall, false}}}}, > 1, /* scalar_stmt_cost. */ > 1, /* scalar load_cost. 
*/ > @@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe > m_PPRO, > > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ > - m_CORE2I7 | m_GENERIC, > + m_GENERIC | m_CORE2, > > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > * on 16-bit immediate moves into memory on Core2 and Corei7. */ > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe > m_K6, > > /* X86_TUNE_USE_CLTD */ > - ~(m_PENT | m_ATOM | m_K6), > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), > > /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ > m_PENT4, > @@ -1901,7 +1901,7 @@ static unsigned int initial_ix86_tune_fe > m_COREI7 | m_BDVER, > > /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */ > - m_BDVER , > + m_BDVER, > > /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies > are resolved on SSE register parts instead of whole registers, so we may > @@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe > m_ATHLON_K8, > > /* X86_TUNE_SSE_TYPELESS_STORES */ > - m_AMD_MULTIPLE, > + m_AMD_MULTIPLE | m_CORE2I7, /*????*/ > > /* X86_TUNE_SSE_LOAD0_BY_PXOR */ > - m_PPRO | m_P4_NOCONA, > + m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/ > > /* X86_TUNE_MEMORY_MISMATCH_STALL */ > m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > @@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > than 4 branch instructions in the 16 byte window. */ > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_SCHEDULE */ > m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, > @@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe > m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_USE_INCDEC */ > - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), > + ~(m_P4_NOCONA | m_ATOM | m_GENERIC), > > /* X86_TUNE_PAD_RETURNS */ > - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, > + m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */ > m_ATOM, > @@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe > m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, > > /* X86_TUNE_AVOID_VECTOR_DECODE */ > - m_CORE2I7 | m_K8 | m_GENERIC64, > + m_K8 | m_GENERIC64, > > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > @@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is > vector path on AMD machines. */ > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD > machines. */ > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR > than a MOV. */ > @@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion > from FP to FP. */ > - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, > + m_AMDFAM10 | m_GENERIC, > > /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion > from integer to FP. 
*/ > @@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction > with a subsequent conditional jump instruction into a single > compare-and-branch uop. */ > - m_BDVER, > + m_BDVER | m_CORE2I7, > > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag > will impact LEA instruction selection. */ > @@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe > }; > > static const unsigned int x86_accumulate_outgoing_args > - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; > + = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > > static const unsigned int x86_arch_always_fancy_math_387 > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > Index: config/i386/i386.md > =================================================================== > --- config/i386/i386.md (revision 194452) > +++ config/i386/i386.md (working copy) > @@ -965,7 +965,7 @@ > (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand") > (match_operand:SDWIM 2 "<general_operand>"))) > (set (pc) (if_then_else > - (match_operator 0 "ordered_comparison_operator" > + (match_operator 0 "comparison_operator" > [(reg:CC FLAGS_REG) (const_int 0)]) > (label_ref (match_operand 3)) > (pc)))] > @@ -983,7 +983,7 @@ > (compare:CC (match_operand:SWIM 2 "nonimmediate_operand") > (match_operand:SWIM 3 "<general_operand>"))) > (set (match_operand:QI 0 "register_operand") > - (match_operator 1 "ordered_comparison_operator" > + (match_operator 1 "comparison_operator" > [(reg:CC FLAGS_REG) (const_int 0)]))] > "" > {
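To make the prologue/epilogue trade-off above concrete, here is a small illustrative function (not from the thread; the helper and its name are made up) that keeps values live across calls, so the prologue has to save several callee-saved registers:

extern long helper (long);

/* a, b and c survive the calls, so they land in callee-saved registers
   (e.g. %rbx, %r12, %r13).  With X86_TUNE_PROLOGUE_USING_MOVE the saves are
   emitted as one stack-pointer adjustment plus MOV stores; without it they
   become a short run of PUSH instructions, which the stack engine on recent
   cores handles at roughly 1 uop / 1 cycle each and which is smaller code.  */
long
use_callee_saved (long a, long b, long c)
{
  long x = helper (a);
  long y = helper (b);
  long z = helper (c);
  return x + y + z;
}

Compiling this with gcc -O2 -S -mtune=generic before and after the patch shows the two prologue/epilogue shapes being compared.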
Honza, can you explain each change and point to the reference? thanks, David On Wed, Dec 12, 2012 at 8:37 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> I noticed in prologue/epilogue, GCC prefers to use MOVs followed by a >> SP adjustment instead of a sequence of pushes/pops. The preference to >> the MOVs are good for old CPU micro-architectures (before pentium-4, >> K10), because it breaks the data dependency. In modern >> micro-architecture, push/pop is implemented using a mechanism called >> stack engine. The data dependency is removed by the hardware, and >> push/pop becomes very cheap (1 uOp, 1 cycle latency), and they are >> smaller. There is no longer the need to avoid using them. This is >> also what ICC does. >> >> The following patch fixed the problem. It passes bootstrap/regression >> test. OK to install? >> >> thanks, >> >> David >> >> Index: config/i386/i386.c >> =================================================================== >> --- config/i386/i386.c (revision 194324) >> +++ config/i386/i386.c (working copy) >> @@ -1919,10 +1919,10 @@ static unsigned int initial_ix86_tune_fe >> m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> >> /* X86_TUNE_PROLOGUE_USING_MOVE */ >> - m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC, >> + m_PPRO | m_ATHLON_K8, >> >> /* X86_TUNE_EPILOGUE_USING_MOVE */ >> - m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC, >> + m_PPRO | m_ATHLON_K8, > > Push/pops wrt moves was always difficult to tune on old CPUs, so I am happy it > is gone from generic (in fact I had similar patch pending). > Are you sure about Atom having stack engine, too? > > Related thing is accumulate_outgoing_args. Igor is testing it on Core and I will > give it a try on K10. > > Honza > > I am attaching the changes for core costs I made if someone is interested in > testing them. If we can declare P4/PPRo and maybe K8 chips obsolette for > generic, there is room for improvement in generic, too. Like using inc/dec > again. > > Honza > > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 194452) > +++ config/i386/i386.c (working copy) > @@ -1620,14 +1620,14 @@ struct processor_costs core_cost = { > COSTS_N_INSNS (8), /* cost of FABS instruction. */ > COSTS_N_INSNS (8), /* cost of FCHS instruction. */ > COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ > - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, > + {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > {-1, libcall, false}}}}, > {{libcall, {{6, loop_1_byte, true}, > {24, loop, true}, > {8192, rep_prefix_4_byte, true}, > {-1, libcall, false}}}, > - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > {-1, libcall, false}}}}, > 1, /* scalar_stmt_cost. */ > 1, /* scalar load_cost. */ > @@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe > m_PPRO, > > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ > - m_CORE2I7 | m_GENERIC, > + m_GENERIC | m_CORE2, > > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > * on 16-bit immediate moves into memory on Core2 and Corei7. 
*/ > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe > m_K6, > > /* X86_TUNE_USE_CLTD */ > - ~(m_PENT | m_ATOM | m_K6), > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), > > /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ > m_PENT4, > @@ -1901,7 +1901,7 @@ static unsigned int initial_ix86_tune_fe > m_COREI7 | m_BDVER, > > /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */ > - m_BDVER , > + m_BDVER, > > /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies > are resolved on SSE register parts instead of whole registers, so we may > @@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe > m_ATHLON_K8, > > /* X86_TUNE_SSE_TYPELESS_STORES */ > - m_AMD_MULTIPLE, > + m_AMD_MULTIPLE | m_CORE2I7, /*????*/ > > /* X86_TUNE_SSE_LOAD0_BY_PXOR */ > - m_PPRO | m_P4_NOCONA, > + m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/ > > /* X86_TUNE_MEMORY_MISMATCH_STALL */ > m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > @@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > than 4 branch instructions in the 16 byte window. */ > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_SCHEDULE */ > m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, > @@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe > m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_USE_INCDEC */ > - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), > + ~(m_P4_NOCONA | m_ATOM | m_GENERIC), > > /* X86_TUNE_PAD_RETURNS */ > - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, > + m_AMD_MULTIPLE | m_GENERIC, > > /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */ > m_ATOM, > @@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe > m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, > > /* X86_TUNE_AVOID_VECTOR_DECODE */ > - m_CORE2I7 | m_K8 | m_GENERIC64, > + m_K8 | m_GENERIC64, > > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > @@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is > vector path on AMD machines. */ > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD > machines. */ > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR > than a MOV. */ > @@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion > from FP to FP. */ > - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, > + m_AMDFAM10 | m_GENERIC, > > /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion > from integer to FP. */ > @@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction > with a subsequent conditional jump instruction into a single > compare-and-branch uop. */ > - m_BDVER, > + m_BDVER | m_CORE2I7, > > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. 
This flag > will impact LEA instruction selection. */ > @@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe > }; > > static const unsigned int x86_accumulate_outgoing_args > - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; > + = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > > static const unsigned int x86_arch_always_fancy_math_387 > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > Index: config/i386/i386.md > =================================================================== > --- config/i386/i386.md (revision 194452) > +++ config/i386/i386.md (working copy) > @@ -965,7 +965,7 @@ > (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand") > (match_operand:SDWIM 2 "<general_operand>"))) > (set (pc) (if_then_else > - (match_operator 0 "ordered_comparison_operator" > + (match_operator 0 "comparison_operator" > [(reg:CC FLAGS_REG) (const_int 0)]) > (label_ref (match_operand 3)) > (pc)))] > @@ -983,7 +983,7 @@ > (compare:CC (match_operand:SWIM 2 "nonimmediate_operand") > (match_operand:SWIM 3 "<general_operand>"))) > (set (match_operand:QI 0 "register_operand") > - (match_operator 1 "ordered_comparison_operator" > + (match_operator 1 "comparison_operator" > [(reg:CC FLAGS_REG) (const_int 0)]))] > "" > {
Concerning 1push per cycle, I think it is same as K7 hardware did, so move prologue should be a win. > > Index: config/i386/i386.c > > =================================================================== > > --- config/i386/i386.c (revision 194452) > > +++ config/i386/i386.c (working copy) > > @@ -1620,14 +1620,14 @@ struct processor_costs core_cost = { > > COSTS_N_INSNS (8), /* cost of FABS instruction. */ > > COSTS_N_INSNS (8), /* cost of FCHS instruction. */ > > COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ > > - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > > - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, > > + {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}}, > > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > > {-1, libcall, false}}}}, > > {{libcall, {{6, loop_1_byte, true}, > > {24, loop, true}, > > {8192, rep_prefix_4_byte, true}, > > {-1, libcall, false}}}, > > - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, > > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, libcall is not faster up to 8KB to rep sequence that is better for regalloc/code cache than fully blowin function call. > > @@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe > > m_PPRO, > > > > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ > > - m_CORE2I7 | m_GENERIC, > > + m_GENERIC | m_CORE2, This disable shifts that store just some flags. Acroding to Agner's manual I7 handle this well. Partial flags stall The Sandy Bridge uses the method of an extra Âľop to join partial registers not only for general purpose registers but also for the flags register, unlike previous processors which used this method only for general purpose registers. This occurs when a write to a part of the flags register is followed by a read from a larger part of the flags register. The partial flags stall of previous processors (See page 75) is therefore replaced by an extra Âľop. The Sandy Bridge also generates an extra Âľop when reading the flags after a rotate instruction. This is cheaper than the 7 cycle delay on Core this flags is trying to avoid. > > > > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > > * on 16-bit immediate moves into memory on Core2 and Corei7. */ > > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe > > m_K6, > > > > /* X86_TUNE_USE_CLTD */ > > - ~(m_PENT | m_ATOM | m_K6), > > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), None of CPUs that generic care about are !USE_CLTD now after your change. > > @@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe > > m_ATHLON_K8, > > > > /* X86_TUNE_SSE_TYPELESS_STORES */ > > - m_AMD_MULTIPLE, > > + m_AMD_MULTIPLE | m_CORE2I7, /*????*/ Hmm, I can not seem to find this in manual now, but I believe that stores also do not type, so movaps is preferred over movapd store because it is shorter. If not, this change should produce a lot of slowdowns. > > > > /* X86_TUNE_SSE_LOAD0_BY_PXOR */ > > - m_PPRO | m_P4_NOCONA, > > + m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/ Agner: A common way of setting a register to zero is XOR EAX,EAX or SUB EBX,EBX. The Core2 and Nehalem processors recognize that certain instructions are independent of the prior value of the register if the source and destination registers are the same. This applies to all of the following instructions: XOR, SUB, PXOR, XORPS, XORPD, and all variants of PSUBxxx and PCMPxxx except PCMPEQQ. 
> > > > /* X86_TUNE_MEMORY_MISMATCH_STALL */ > > m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > @@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe > > > > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > > than 4 branch instructions in the 16 byte window. */ > > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, This is special passs to handle limitations of AMD's K7/K8/K10 branch prediction. Intel never had similar design, so this flag is pointless. We apparently ought to disable it for K10, at least per Agner's manual. > > > > /* X86_TUNE_SCHEDULE */ > > m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, > > @@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe > > m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > > > /* X86_TUNE_USE_INCDEC */ > > - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), > > + ~(m_P4_NOCONA | m_ATOM | m_GENERIC), Skipping inc/dec is to avoid partial flag stall happening on P4 only. > > > > /* X86_TUNE_PAD_RETURNS */ > > - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, > > + m_AMD_MULTIPLE | m_GENERIC, Again this deals specifically with AMD K7/K8/K10 branch prediction. I am not even sure this should be enabled for K10. > > > > /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */ > > m_ATOM, > > @@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe > > m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, > > > > /* X86_TUNE_AVOID_VECTOR_DECODE */ > > - m_CORE2I7 | m_K8 | m_GENERIC64, > > + m_K8 | m_GENERIC64, This avoid AMD vector decoded instructions, again if it helped it did so by accident. > > > > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode > > and SImode multiply, but 386 and 486 do HImode multiply faster. */ > > @@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe > > > > /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is > > vector path on AMD machines. */ > > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > > > /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD > > machines. */ > > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, > > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, This is similarly targetd for AMD hardware only. I did not find ismilar limiation in the optimization manual. > > > > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR > > than a MOV. */ > > @@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe > > > > /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion > > from FP to FP. */ > > - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, > > + m_AMDFAM10 | m_GENERIC, This is quite specific feature of AMD chips to preffer packed converts over scalar. Nothing like this is documented for cores > > > > /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion > > from integer to FP. */ > > @@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe > > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction > > with a subsequent conditional jump instruction into a single > > compare-and-branch uop. */ > > - m_BDVER, > > + m_BDVER | m_CORE2I7, Core iplements fusion similar to what AMD does so I think this just applies here. 
> > > > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag > > will impact LEA instruction selection. */ > > @@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe > > }; > > > > static const unsigned int x86_accumulate_outgoing_args > > - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; > > + = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; Stack engine should make this cheap, just like prologues in moves. This definitely needs some validation, the accumulate-outgoing-args codegen differs quite a lot. Also this leads to unwind tables bloat. > > > > static const unsigned int x86_arch_always_fancy_math_387 > > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; Honza
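As a concrete instance of one of the items above, X86_TUNE_USE_INCDEC, here is a one-line illustrative function (not from the thread) whose generated code the flag changes:

/* When the target is allowed to use inc/dec this compiles to "incl (%rdi)";
   for CPUs listed under the ~(...) mask it is emitted as "addl $1, (%rdi)"
   instead.  inc and dec leave CF untouched, which is the partial-flags
   concern discussed above; per the thread that only really hurts on
   Pentium 4, which is why removing m_CORE2I7 from the avoid list is
   proposed.  */
void
bump (int *counter)
{
  (*counter)++;
}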
Jan Hubicka <hubicka@ucw.cz> writes: > > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code > cache than fully blowin function call. I noticed btw that some of the generated string instructions are slower than just calling the C library. rep scasb etc. is rarely a win over an optimized library function, it's not very optimized. Perhaps those patterns should just be disabled. The way to optimize that on modern CPUs is to use PCMP*STR*, but that's quite a bit more complicated and has some constraints. >> > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more >> > than 4 branch instructions in the 16 byte window. */ >> > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > This is special passs to handle limitations of AMD's K7/K8/K10 branch prediction. > Intel never had similar design, so this flag is pointless. Actually the Sandy Bridge decoded icache has a limit of 3 jumps per 16 byte window. If you exceed that it falls back to running the full decoder from the normal icache. I don't have solid data, but it may be a win for frontend limited code (otherwise possibly more in power than performance) I would revisit that for Sandy Bridge -Andi
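Andi's PCMP*STR* remark refers to the SSE4.2 string instructions. A heavily simplified sketch of the idea (a hypothetical helper, not from the thread; it is only safe for strings known to sit in at least 16 readable bytes, and it needs -msse4.2):

#include <immintrin.h>

/* Index of the terminating NUL within the first 16 bytes of s, or 16 if
   there is none.  PCMPISTRI in "equal each" mode against an all-zero
   operand is the classic building block of an SSE4.2 strlen; a production
   version also has to loop over 16-byte blocks and avoid reading across
   page boundaries, which is part of the complexity Andi mentions.  */
static inline int
nul_index_16 (const char *s)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) s);
  return _mm_cmpistri (_mm_setzero_si128 (), chunk,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
}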
> Jan Hubicka <hubicka@ucw.cz> writes: > > > > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code > > cache than fully blowin function call. > > I noticed btw that some of the generated string instructions are slower > than just calling the C library. > > rep scasb etc. is rarely a win over an optimized library function, > it's not very optimized. Perhaps those patterns should just be disabled. > The way to optimize that on modern CPUs is to use PCMP*STR*, but that's > quite a bit more complicated and has some constraints. This is only about memset/memcpy expanding. The other sequences are quite lame indeed... > > > >> > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more > >> > than 4 branch instructions in the 16 byte window. */ > >> > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > >> > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > > > This is special passs to handle limitations of AMD's K7/K8/K10 branch prediction. > > Intel never had similar design, so this flag is pointless. > > Actually the Sandy Bridge decoded icache has a limit of 3 jumps per > 16 byte window. If you exceed that it falls back to running > the full decoder from the normal icache. > > I don't have solid data, but it may be a win for frontend limited > code (otherwise possibly more in power than performance) > > I would revisit that for Sandy Bridge We are not particularly good on avoiding the branches - basically the code inserts alignment whenever it things the 4 consecutive branches fit in the window. I can make patch to change this to 3 and we can see if it helps at all. > > -Andi > -- > ak@linux.intel.com -- Speaking for myself only
I'm working on OS-adaptations for an OS that would use x86-64 applications that are located above 4G, but not in the upper area. Binutils provide a function to be able to set the start of text to above 4G, but there are problems with GCC when using this memory model. The first issue has to do with creating a cross-compiler that defaults to medium memory model using PIC. While there are switches to achieve this on command line (-mcmodel=medium -fpic), this is inconvinient since everything ported must be changed to add these switches, including libgcc and newlib. The cross-compiler instead should default to this memory model. One possibility to achieve this is to add a new .h-file in the gcc/config/i386 directory. However, further inspection of the source indicates there is no macro that can be redefined to achieve this. One possibility is to add such a macro in gcc/config/i386.h, and implement it in gcc/config/i386.c. Here is a simple patch to do this: diff -u -r -N gcc-4.8-20121202/gcc/config/i386/i386.h gcc-work/gcc/config/i386/i386.h --- gcc-4.8-20121202/gcc/config/i386/i386.h 2012-11-23 17:02:10.000000000 +0100 +++ gcc-work/gcc/config/i386/i386.h 2012-12-08 12:17:40.000000000 +0100 @@ -86,6 +86,8 @@ #define TARGET_LP64 TARGET_ABI_64 #define TARGET_X32 TARGET_ABI_X32 +#define TARGET_MEDIUM_PIC 0 + /* SSE4.1 defines round instructions */ #define OPTION_MASK_ISA_ROUND OPTION_MASK_ISA_SSE4_1 #define TARGET_ISA_ROUND ((ix86_isa_flags & OPTION_MASK_ISA_ROUND) != 0) diff -u -r -N gcc-4.8-20121202/gcc/config/i386/i386.c gcc-work/gcc/config/i386/i386.c --- gcc-4.8-20121202/gcc/config/i386/i386.c 2012-12-02 00:43:52.000000000 +0100 +++ gcc-work/gcc/config/i386/i386.c 2012-12-11 21:43:48.000000000 +0100 @@ -3235,6 +3235,8 @@ DLL, and is essentially just as efficient as direct addressing. */ if (TARGET_64BIT && DEFAULT_ABI == MS_ABI) ix86_cmodel = CM_SMALL_PIC, flag_pic = 1; + else if (TARGET_64BIT && TARGET_MEDIUM_PIC) + ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1; else if (TARGET_64BIT) ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL; else It can be used like this: +#undef TARGET_MEDIUM_PIC +#define TARGET_MEDIUM_PIC 1 Next, with this issue fixed, there is still another problem in libgcc when using a cross-compiler compiled with this memory model: ../../gcc-work/libgcc/. -I../../../gcc-work/libgcc/../gcc -I../../../gcc-work/libgcc/../include -DHAVE_CC_TLS -o cpuinfo.o -MT cpuinfo.o -MD -MP -MF cpuinfo.dep -c ../../../gcc-work/libgcc/config/i386/cpuinfo.c -fvisibility=hidden -DHIDE_EXPORTS In file included from ../../../gcc-work/libgcc/config/i386/cpuinfo.c:21:0: ../../../gcc-work/libgcc/config/i386/cpuinfo.c: In function 'get_available_features': ../../../gcc-work/libgcc/config/i386/cpuinfo.c:236:7: error: inconsistent operand constraints in an 'asm' __cpuid_count (7, 0, eax, ebx, ecx, edx); ^ ../../../gcc-work/libgcc/static-object.mk:17: recipe for target `cpuinfo.o' failed make[1]: *** [cpuinfo.o] Error 1 make[1]: Lämnar katalogen "/usr/src/build-gcc-noheader/rdos/libgcc" Makefile:10619: recipe for target `all-target-libgcc' failed make: *** [all-target-libgcc] Error 2 My guess is that __cpuid_count uses 32-bit addressing when it should be using 64-bit addressing. I have no patch for this as I don't understand what is going on here well enough, but __cpuid_count is defined in gcc/config/i386/cpuid.h. In order to be able to continue to test the medium memory model I'd need patches to be applied to fix these issues. Regards, Leif Ekblad RDOS Development
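For readers unfamiliar with the medium model Leif is targeting, a tiny illustrative file (not from the thread; the default -mlarge-data-threshold of 64 KiB is assumed, and the file name is made up) showing what the model changes:

/* Built with: gcc -mcmodel=medium -fpic -c example.c
   big_table exceeds the large-data threshold, so it is placed in the .lbss
   large-data section and addressed with 64-bit relocations, which is what
   lets data live above 4G; small_counter stays in ordinary .bss and keeps
   cheap 32-bit RIP-relative addressing.  */
static char big_table[1 << 20];   /* 1 MiB, above the 64 KiB threshold */
static int  small_counter;

char *
table_base (void)
{
  small_counter++;
  return big_table;
}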
On Wed, Dec 12, 2012 at 12:56 PM, Leif Ekblad <leif@rdos.net> wrote: > I'm working on OS-adaptations for an OS that would use x86-64 applications > that are located above 4G, but not in the upper area. Binutils provide a > function to be able to set the start of text to above 4G, but there are > problems with GCC when using this memory model. > Have you tried PIE with small model? You can place your binaries above 4G with better performance.
The small memory model will not do since I want to put data at other distinct addresses above 4G. I also want to place the heap at yet another address interval. This way it becomes easy to separate out code, data and heap references, and making sure that pointers are valid. The primary reason for not using below 2G or the last 2G, is because such numbers are formed naturally when doing 32-bit arithmetics, and thus could be executed by chance from corrupt data. If I understand it correctly, the PIE option is very similar to the PIC option, and will not make it possible to use any address for both code and data. Additionally, when I tried the small memory model with a start address of text above 4G, the linker complains about 32-bit fixups overflowing. Leif Ekblad ----- Original Message ----- From: "H.J. Lu" <hjl.tools@gmail.com> To: "Leif Ekblad" <leif@rdos.net> Cc: "GCC Patches" <gcc-patches@gcc.gnu.org>; "GCC Mailing List" <gcc@gcc.gnu.org> Sent: Wednesday, December 12, 2012 9:59 PM Subject: Re: x86-64 medium memory model > On Wed, Dec 12, 2012 at 12:56 PM, Leif Ekblad <leif@rdos.net> wrote: >> I'm working on OS-adaptations for an OS that would use x86-64 >> applications >> that are located above 4G, but not in the upper area. Binutils provide a >> function to be able to set the start of text to above 4G, but there are >> problems with GCC when using this memory model. >> > > Have you tried PIE with small model? You can place your > binaries above 4G with better performance. > > -- > H.J.
On Wed, Dec 12, 2012 at 10:30 AM, Jan Hubicka <hubicka@ucw.cz> wrote: > Concerning 1push per cycle, I think it is same as K7 hardware did, so move > prologue should be a win. >> > Index: config/i386/i386.c >> > =================================================================== >> > --- config/i386/i386.c (revision 194452) >> > +++ config/i386/i386.c (working copy) >> > @@ -1620,14 +1620,14 @@ struct processor_costs core_cost = { >> > COSTS_N_INSNS (8), /* cost of FABS instruction. */ >> > COSTS_N_INSNS (8), /* cost of FCHS instruction. */ >> > COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ >> > - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, >> > - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, >> > + {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}}, >> > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, >> > {-1, libcall, false}}}}, >> > {{libcall, {{6, loop_1_byte, true}, >> > {24, loop, true}, >> > {8192, rep_prefix_4_byte, true}, >> > {-1, libcall, false}}}, >> > - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, >> > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, > > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code > cache than fully blowin function call. Be careful with this. My recollection is that REP sequence is good for any size -- for smaller size, the REP initial set up cost is too high (10s of cycles), while for large size copy, it is less efficient compared with library version. >> > @@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe >> > m_PPRO, >> > >> > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ >> > - m_CORE2I7 | m_GENERIC, >> > + m_GENERIC | m_CORE2, > > This disable shifts that store just some flags. Acroding to Agner's manual I7 handle > this well. > ok. > Partial flags stall > The Sandy Bridge uses the method of an extra Âľop to join partial registers not only for > general purpose registers but also for the flags register, unlike previous processors which > used this method only for general purpose registers. This occurs when a write to a part of > the flags register is followed by a read from a larger part of the flags register. The partial > flags stall of previous processors (See page 75) is therefore replaced by an extra Âľop. The > Sandy Bridge also generates an extra Âľop when reading the flags after a rotate instruction. > > This is cheaper than the 7 cycle delay on Core this flags is trying to avoid. ok. >> > >> > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall >> > * on 16-bit immediate moves into memory on Core2 and Corei7. */ >> > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe >> > m_K6, >> > >> > /* X86_TUNE_USE_CLTD */ >> > - ~(m_PENT | m_ATOM | m_K6), >> > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), My change was to enable CLTD for generic. Is your change intended to revert that? > > None of CPUs that generic care about are !USE_CLTD now after your change. >> > @@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe >> > m_ATHLON_K8, >> > >> > /* X86_TUNE_SSE_TYPELESS_STORES */ >> > - m_AMD_MULTIPLE, >> > + m_AMD_MULTIPLE | m_CORE2I7, /*????*/ > > Hmm, I can not seem to find this in manual now, but I believe that stores also do not type, > so movaps is preferred over movapd store because it is shorter. If not, this change should > produce a lot of slowdowns. 
>> > >> > /* X86_TUNE_SSE_LOAD0_BY_PXOR */ >> > - m_PPRO | m_P4_NOCONA, >> > + m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/ > > Agner: > A common way of setting a register to zero is XOR EAX,EAX or SUB EBX,EBX. The > Core2 and Nehalem processors recognize that certain instructions are independent of the > prior value of the register if the source and destination registers are the same. > > This applies to all of the following instructions: XOR, SUB, PXOR, XORPS, XORPD, and all > variants of PSUBxxx and PCMPxxx except PCMPEQQ. >> > >> > /* X86_TUNE_MEMORY_MISMATCH_STALL */ >> > m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> > @@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe >> > >> > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more >> > than 4 branch instructions in the 16 byte window. */ >> > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, > > This is special passs to handle limitations of AMD's K7/K8/K10 branch prediction. > Intel never had similar design, so this flag is pointless. I noticed that too, but Andi has a better answer to it. > > We apparently ought to disable it for K10, at least per Agner's manual. >> > >> > /* X86_TUNE_SCHEDULE */ >> > m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, >> > @@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe >> > m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> > >> > /* X86_TUNE_USE_INCDEC */ >> > - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), >> > + ~(m_P4_NOCONA | m_ATOM | m_GENERIC), > > Skipping inc/dec is to avoid partial flag stall happening on P4 only. >> > K8 and K10 partitions the flags into groups. References to flags to the same group can still cause the stall -- not sure how that can be handled. >> > /* X86_TUNE_PAD_RETURNS */ >> > - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, >> > + m_AMD_MULTIPLE | m_GENERIC, > > Again this deals specifically with AMD K7/K8/K10 branch prediction. I am not even > sure this should be enabled for K10. >> > >> > /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */ >> > m_ATOM, >> > @@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe >> > m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, >> > >> > /* X86_TUNE_AVOID_VECTOR_DECODE */ >> > - m_CORE2I7 | m_K8 | m_GENERIC64, >> > + m_K8 | m_GENERIC64, > > This avoid AMD vector decoded instructions, again if it helped it did so by accident. >> > >> > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode >> > and SImode multiply, but 386 and 486 do HImode multiply faster. */ >> > @@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe >> > >> > /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is >> > vector path on AMD machines. */ >> > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, >> > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, >> > >> > /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD >> > machines. */ >> > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, >> > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, > > This is similarly targetd for AMD hardware only. I did not find ismilar limiation > in the optimization manual. >> > >> > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR >> > than a MOV. 
*/ >> > @@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe >> > >> > /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion >> > from FP to FP. */ >> > - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, >> > + m_AMDFAM10 | m_GENERIC, > > This is quite specific feature of AMD chips to preffer packed converts over > scalar. Nothing like this is documented for cores >> > >> > /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion >> > from integer to FP. */ >> > @@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe >> > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction >> > with a subsequent conditional jump instruction into a single >> > compare-and-branch uop. */ >> > - m_BDVER, >> > + m_BDVER | m_CORE2I7, > > Core iplements fusion similar to what AMD does so I think this just applies here. yes. thanks, David >> > >> > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag >> > will impact LEA instruction selection. */ >> > @@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe >> > }; >> > >> > static const unsigned int x86_accumulate_outgoing_args >> > - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; >> > + = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > > Stack engine should make this cheap, just like prologues in moves. > This definitely needs some validation, the accumulate-outgoing-args > codegen differs quite a lot. Also this leads to unwind tables bloat. >> > >> > static const unsigned int x86_arch_always_fancy_math_387 >> > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; > > Honza
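Since X86_TUNE_FUSE_CMP_AND_BRANCH comes up here, a small illustrative function (not from the thread) containing the pattern the flag is about; setting it for Core asks GCC to keep a compare or test adjacent to the conditional jump that consumes it, so the decoder can fuse the pair into one uop:

#include <stddef.h>

/* The loop back edge compiles to a compare of the index against n followed
   immediately by a conditional jump, a macro-fusion candidate on Core and
   on Bulldozer alike.  */
size_t
count_nonzero (const int *v, size_t n)
{
  size_t i, k = 0;
  for (i = 0; i < n; i++)
    if (v[i] != 0)
      k++;
  return k;
}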
On Wed, Dec 12, 2012 at 4:16 PM, Xinliang David Li <davidxl@google.com> wrote: > On Wed, Dec 12, 2012 at 10:30 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> Concerning 1push per cycle, I think it is same as K7 hardware did, so move >> prologue should be a win. >>> > Index: config/i386/i386.c >>> > =================================================================== >>> > --- config/i386/i386.c (revision 194452) >>> > +++ config/i386/i386.c (working copy) >>> > @@ -1620,14 +1620,14 @@ struct processor_costs core_cost = { >>> > COSTS_N_INSNS (8), /* cost of FABS instruction. */ >>> > COSTS_N_INSNS (8), /* cost of FCHS instruction. */ >>> > COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ >>> > - {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}}, >>> > - {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true}, >>> > + {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}}, >>> > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, >>> > {-1, libcall, false}}}}, >>> > {{libcall, {{6, loop_1_byte, true}, >>> > {24, loop, true}, >>> > {8192, rep_prefix_4_byte, true}, >>> > {-1, libcall, false}}}, >>> > - {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true}, >>> > + {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true}, >> >> libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> cache than fully blowin function call. > > Be careful with this. My recollection is that REP sequence is good for s/good/not good/ David > any size -- for smaller size, the REP initial set up cost is too high > (10s of cycles), while for large size copy, it is less efficient > compared with library version. > > >>> > @@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe >>> > m_PPRO, >>> > >>> > /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ >>> > - m_CORE2I7 | m_GENERIC, >>> > + m_GENERIC | m_CORE2, >> >> This disable shifts that store just some flags. Acroding to Agner's manual I7 handle >> this well. >> > > ok. > >> Partial flags stall >> The Sandy Bridge uses the method of an extra Âľop to join partial registers not only for >> general purpose registers but also for the flags register, unlike previous processors which >> used this method only for general purpose registers. This occurs when a write to a part of >> the flags register is followed by a read from a larger part of the flags register. The partial >> flags stall of previous processors (See page 75) is therefore replaced by an extra Âľop. The >> Sandy Bridge also generates an extra Âľop when reading the flags after a rotate instruction. >> >> This is cheaper than the 7 cycle delay on Core this flags is trying to avoid. > > ok. > >>> > >>> > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall >>> > * on 16-bit immediate moves into memory on Core2 and Corei7. */ >>> > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe >>> > m_K6, >>> > >>> > /* X86_TUNE_USE_CLTD */ >>> > - ~(m_PENT | m_ATOM | m_K6), >>> > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), > > My change was to enable CLTD for generic. Is your change intended to > revert that? > >> >> None of CPUs that generic care about are !USE_CLTD now after your change. 
>>> > @@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe >>> > m_ATHLON_K8, >>> > >>> > /* X86_TUNE_SSE_TYPELESS_STORES */ >>> > - m_AMD_MULTIPLE, >>> > + m_AMD_MULTIPLE | m_CORE2I7, /*????*/ >> >> Hmm, I can not seem to find this in manual now, but I believe that stores also do not type, >> so movaps is preferred over movapd store because it is shorter. If not, this change should >> produce a lot of slowdowns. >>> > >>> > /* X86_TUNE_SSE_LOAD0_BY_PXOR */ >>> > - m_PPRO | m_P4_NOCONA, >>> > + m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/ >> >> Agner: >> A common way of setting a register to zero is XOR EAX,EAX or SUB EBX,EBX. The >> Core2 and Nehalem processors recognize that certain instructions are independent of the >> prior value of the register if the source and destination registers are the same. >> >> This applies to all of the following instructions: XOR, SUB, PXOR, XORPS, XORPD, and all >> variants of PSUBxxx and PCMPxxx except PCMPEQQ. >>> > >>> > /* X86_TUNE_MEMORY_MISMATCH_STALL */ >>> > m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >>> > @@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe >>> > >>> > /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more >>> > than 4 branch instructions in the 16 byte window. */ >>> > - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >>> > + m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >> >> This is special passs to handle limitations of AMD's K7/K8/K10 branch prediction. >> Intel never had similar design, so this flag is pointless. > > I noticed that too, but Andi has a better answer to it. > >> >> We apparently ought to disable it for K10, at least per Agner's manual. >>> > >>> > /* X86_TUNE_SCHEDULE */ >>> > m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, >>> > @@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe >>> > m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, >>> > >>> > /* X86_TUNE_USE_INCDEC */ >>> > - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), >>> > + ~(m_P4_NOCONA | m_ATOM | m_GENERIC), >> >> Skipping inc/dec is to avoid partial flag stall happening on P4 only. >>> > > > > K8 and K10 partitions the flags into groups. References to flags to > the same group can still cause the stall -- not sure how that can be > handled. > >>> > /* X86_TUNE_PAD_RETURNS */ >>> > - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, >>> > + m_AMD_MULTIPLE | m_GENERIC, >> >> Again this deals specifically with AMD K7/K8/K10 branch prediction. I am not even >> sure this should be enabled for K10. >>> > >>> > /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */ >>> > m_ATOM, >>> > @@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe >>> > m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, >>> > >>> > /* X86_TUNE_AVOID_VECTOR_DECODE */ >>> > - m_CORE2I7 | m_K8 | m_GENERIC64, >>> > + m_K8 | m_GENERIC64, >> >> This avoid AMD vector decoded instructions, again if it helped it did so by accident. >>> > >>> > /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode >>> > and SImode multiply, but 386 and 486 do HImode multiply faster. */ >>> > @@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe >>> > >>> > /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is >>> > vector path on AMD machines. 
*/ >>> > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, >>> > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, >>> > >>> > /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD >>> > machines. */ >>> > - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, >>> > + m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER, >> >> This is similarly targetd for AMD hardware only. I did not find ismilar limiation >> in the optimization manual. >>> > >>> > /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR >>> > than a MOV. */ >>> > @@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe >>> > >>> > /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion >>> > from FP to FP. */ >>> > - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, >>> > + m_AMDFAM10 | m_GENERIC, >> >> This is quite specific feature of AMD chips to preffer packed converts over >> scalar. Nothing like this is documented for cores >>> > >>> > /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion >>> > from integer to FP. */ >>> > @@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe >>> > /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction >>> > with a subsequent conditional jump instruction into a single >>> > compare-and-branch uop. */ >>> > - m_BDVER, >>> > + m_BDVER | m_CORE2I7, >> >> Core iplements fusion similar to what AMD does so I think this just applies here. > > yes. > > > thanks, > > David > > >>> > >>> > /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag >>> > will impact LEA instruction selection. */ >>> > @@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe >>> > }; >>> > >>> > static const unsigned int x86_accumulate_outgoing_args >>> > - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; >>> > + = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; >> >> Stack engine should make this cheap, just like prologues in moves. >> This definitely needs some validation, the accumulate-outgoing-args >> codegen differs quite a lot. Also this leads to unwind tables bloat. >>> > >>> > static const unsigned int x86_arch_always_fancy_math_387 >>> > = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; >> >> Honza
> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code > > cache than fully blowin function call. > > Be careful with this. My recollection is that REP sequence is good for > any size -- for smaller size, the REP initial set up cost is too high > (10s of cycles), while for large size copy, it is less efficient > compared with library version. Well this is based on the data from the memtest script. Core has good REP implementation - it is a win from rather small blocks (16 bytes if I recall) and it does not need alignment. Library version starts to be interesting with caching hints, but I think till 80KB it is still not a win for my setup (glibc-2.15) > >> > > >> > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall > >> > * on 16-bit immediate moves into memory on Core2 and Corei7. */ > >> > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe > >> > m_K6, > >> > > >> > /* X86_TUNE_USE_CLTD */ > >> > - ~(m_PENT | m_ATOM | m_K6), > >> > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), > > My change was to enable CLTD for generic. Is your change intended to > revert that? No, it is merge conflict, sorry. I will update it in my tree. > > Skipping inc/dec is to avoid partial flag stall happening on P4 only. > >> > > > > K8 and K10 partitions the flags into groups. References to flags to > the same group can still cause the stall -- not sure how that can be > handled. I belive the stalls happends only in quite special cases where compare instruction combines flags from multiple instructions. GCC don't generate this type of code, so we should be safe. Honza
On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> > cache than fully blowin function call. >> >> Be careful with this. My recollection is that REP sequence is good for >> any size -- for smaller size, the REP initial set up cost is too high >> (10s of cycles), while for large size copy, it is less efficient >> compared with library version. > > Well this is based on the data from the memtest script. > Core has good REP implementation - it is a win from rather small blocks (16 > bytes if I recall) and it does not need alignment. > Library version starts to be interesting with caching hints, but I think till 80KB > it is still not a win for my setup (glibc-2.15) A simple test shows that -mstringop-strategy=libcall always beats -mstringop-strategy=rep_8byte (on core2 and corei7) except for size smaller than 8 where the rep_8byte strategy simply bypasses REP movs. Can you share your memtest ? thanks, David >> >> > >> >> > /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall >> >> > * on 16-bit immediate moves into memory on Core2 and Corei7. */ >> >> > @@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe >> >> > m_K6, >> >> > >> >> > /* X86_TUNE_USE_CLTD */ >> >> > - ~(m_PENT | m_ATOM | m_K6), >> >> > + ~(m_PENT | m_ATOM | m_K6 | m_GENERIC), >> >> My change was to enable CLTD for generic. Is your change intended to >> revert that? > > No, it is merge conflict, sorry. I will update it in my tree. >> > Skipping inc/dec is to avoid partial flag stall happening on P4 only. >> >> > >> >> >> K8 and K10 partitions the flags into groups. References to flags to >> the same group can still cause the stall -- not sure how that can be >> handled. > > I belive the stalls happends only in quite special cases where compare instruction > combines flags from multiple instructions. GCC don't generate this type of code, so > we should be safe. > > Honza
On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote: > On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: > >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code > >> > cache than fully blowin function call. > >> > >> Be careful with this. My recollection is that REP sequence is good for > >> any size -- for smaller size, the REP initial set up cost is too high > >> (10s of cycles), while for large size copy, it is less efficient > >> compared with library version. > > > > Well this is based on the data from the memtest script. > > Core has good REP implementation - it is a win from rather small blocks (16 > > bytes if I recall) and it does not need alignment. > > Library version starts to be interesting with caching hints, but I think till 80KB > > it is still not a win for my setup (glibc-2.15) > > A simple test shows that -mstringop-strategy=libcall always beats > -mstringop-strategy=rep_8byte (on core2 and corei7) except for size > smaller than 8 where the rep_8byte strategy simply bypasses REP movs. > Can you share your memtest ? I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a libcall. The PLT call overhead is simply too high. Jakub
Try the following one: 1) -minline-all-stringops -mstringop-strategy=rep_8byte -O2 vs 2) -mstringop-strategy=libcall -O2.

David

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#ifndef LEN
#define LEN 16
#endif

void copy(char* s1, char* s2,int len) __attribute__((noinline));
void copy(char* s1, char* s2,int len)
{
  memcpy(s2,s1,len);
}

int main()
{
  char* s1 = (char*) malloc(LEN +10);
  char* s2 = (char*) malloc(LEN +10);
  int i = 0;
  for (i = 0; i < 1000000000; i++)
    {
      copy(s1+1,s2+3,LEN);
    }
}

On Wed, Dec 12, 2012 at 10:21 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote: >> On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> >> > cache than fully blowin function call. >> >> >> >> Be careful with this. My recollection is that REP sequence is good for >> >> any size -- for smaller size, the REP initial set up cost is too high >> >> (10s of cycles), while for large size copy, it is less efficient >> >> compared with library version. >> > >> > Well this is based on the data from the memtest script. >> > Core has good REP implementation - it is a win from rather small blocks (16 >> > bytes if I recall) and it does not need alignment. >> > Library version starts to be interesting with caching hints, but I think till 80KB >> > it is still not a win for my setup (glibc-2.15) >> >> A simple test shows that -mstringop-strategy=libcall always beats >> -mstringop-strategy=rep_8byte (on core2 and corei7) except for size >> smaller than 8 where the rep_8byte strategy simply bypasses REP movs. >> Can you share your memtest ? > > I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a > libcall. The PLT call overhead is simply too high. > > Jakub
On Thu, Dec 13, 2012 at 7:21 AM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote: >> On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> >> > cache than fully blowin function call. >> >> >> >> Be careful with this. My recollection is that REP sequence is good for >> >> any size -- for smaller size, the REP initial set up cost is too high >> >> (10s of cycles), while for large size copy, it is less efficient >> >> compared with library version. >> > >> > Well this is based on the data from the memtest script. >> > Core has good REP implementation - it is a win from rather small blocks (16 >> > bytes if I recall) and it does not need alignment. >> > Library version starts to be interesting with caching hints, but I think till 80KB >> > it is still not a win for my setup (glibc-2.15) >> >> A simple test shows that -mstringop-strategy=libcall always beats >> -mstringop-strategy=rep_8byte (on core2 and corei7) except for size >> smaller than 8 where the rep_8byte strategy simply bypasses REP movs. >> Can you share your memtest ? > > I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a > libcall. The PLT call overhead is simply too high. I believe the PLT call overhead may be effectively zero if the benchmarking is just a loop around a memcpy. Thus for measuring the PLT overhead I call the benchmark broken ;) Richard. > Jakub
> Try the following one. 1) -minline-all-stringops
> -mstringop-strategy=rep_8byte -O2 vs 1) -mstringop_strategy=libcall
> -O2.
>
> David
>
> #include <string.h>
> #include <stdio.h>
> #include <stdlib.h>
> #ifndef LEN
> #define LEN 16
> #endif
>
> void copy(char* s1, char* s2,int len) __attribute__((noinline));
> void copy(char* s1, char* s2,int len)
> {
>   memcpy(s2,s1,len);
> }

I guess the catch here is that you force the copy to be noinline and thus you
eliminate the benefits of the inlined sequence.  With an inline stringop one
saves regalloc and can often get rid of the alignment tests.

This is the script I use to tune the tables.

Honza

test()
{
rm -f a.out
cat <<END | $1 -x c -O3 $3 -DAVG_SIZE=$2 $STRINGOP -DMEMORY_COPIES=$memsize -
#define BUFFER_SIZE (16*1024*1024 + AVG_SIZE*2)
/*#define MEMORY_COPIES (1024*1024*64*(long long)10)*/
$type t[BUFFER_SIZE];
main()
{
  unsigned int i;
  for (i=0;i<((long long)MEMORY_COPIES + AVG_SIZE * 2 - 1)/AVG_SIZE*2;i++)
#ifdef test_memset
    __builtin_memset (t+(i*1024*1024+i*1)%(BUFFER_SIZE - AVG_SIZE*2), i,
                      (AVG_SIZE + i) % (AVG_SIZE * 2 + 0));
#else
    __builtin_memcpy (t+(i*1024*1024+i*1)%(BUFFER_SIZE - AVG_SIZE*2),
                      t+((i+1)*1024*1024*4+i*1)%(BUFFER_SIZE - AVG_SIZE *2),
                      (AVG_SIZE + i) % (AVG_SIZE * 2 + 0));
#endif
  return 0;
}
END
TIME=`/usr/bin/time -f "%E" ./a.out 2>&1`
echo -n " "$TIME
echo $TIME $4 >>/tmp/accum
}

testrow()
{
echo -n "" >/tmp/accum
printf "block size %7i" $3
test "$2" "$3" "-mstringop-strategy=libcall" libcall
test "$2" "$3" "-mstringop-strategy=rep_byte -malign-stringops" rep1
test "$2" "$3" "-mstringop-strategy=rep_byte -mno-align-stringops" rep1noalign
test "$2" "$3" "-mstringop-strategy=rep_4byte -malign-stringops" rep4
test "$2" "$3" "-mstringop-strategy=rep_4byte -mno-align-stringops" rep4noalign
if [ "$mode" == 64 ]
then
  test "$2" "$3" "-mstringop-strategy=rep_8byte -malign-stringops" rep8
  test "$2" "$3" "-mstringop-strategy=rep_8byte -mno-align-stringops" rep8noalign
fi
test "$2" "$3" "-mstringop-strategy=loop -malign-stringops" loop
test "$2" "$3" "-mstringop-strategy=loop -mno-align-stringops" loopnoalign
test "$2" "$3" "-mstringop-strategy=unrolled_loop -malign-stringops" unrl
test "$2" "$3" "-mstringop-strategy=unrolled_loop -mno-align-stringops" unrlnoalign
test "$2" "$3" "-mstringop-strategy=sse_loop -malign-stringops" sse
test "$2" "$3" "-mstringop-strategy=sse_loop -mno-align-stringops -msse2" ssenoalign
test "$2" "$3" "-mstringop-strategy=byte_loop" byte
best=`cat /tmp/accum | sort | head -1`
test "$2" "$3" " -fprofile-generate" >/dev/null 2>&1
test "$2" "$3" " -fprofile-use"
test "$2" "$3" " -minline-stringops-dynamically"
echo " best: $best"
}

test_all_sizes()
{
if [ "$mode" == 64 ]
then
  echo " libcall rep1 noalg rep4 noalg rep8 noalg loop noalg unrl noalg sse noalg byte profiled dynamic"
else
  echo " libcall rep1 noalg rep4 noalg loop noalg unrl noalg sse noalg byte profiled dynamic"
fi
#for size in 1 2 3 4 6 8 10 12 14 16 24 32 48 64 128 256 512 1024 4096 8192 81920 819200 8192000
#for size in 8192000 819200 81920 8192 4096 2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 5 4 3 2 1
for size in 8192000 819200 81920 20480 8192 4096 2048 1024 512 256 128 64 48 32 24 16 14 12 10 8 6 4 1
#for size in 128 256 1024 4096 8192 81920 819200
do
  testrow "$1" "$2" $size
done
}

mode=$1
shift
export memsize=$1
shift
cmdline=$*
if [ "$mode" != 32 ]
then
  if [ "$mode" != 64 ]
  then
    echo "Usage:"
    echo "test_stringop mode size cmdline"
    echo "mode is either 32 or 64"
    echo "size is amount of memory copied in each test.  Should be chosen small enough so runtime is less than a minute for each test and sorting works"
    echo "Example: test_stringop 32 640000000 ./xgcc -B ./ -march=pentium3"
    exit
  fi
fi
echo "memcpy mode:$mode size:$memsize"
export STRINGOP=""
type=char
test_all_sizes $mode "$cmdline -m$mode"
echo "Aligned"
type=long
test_all_sizes $mode "$cmdline -m$mode"
echo "memset"
type=char
export STRINGOP="-Dtest_memset=1"
test_all_sizes $mode "$cmdline -m$mode"
echo "Aligned"
type=long
test_all_sizes $mode "$cmdline -m$mode"
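Assuming the script above is saved (and made executable) as test_stringop, an invocation following its own usage line might look like the following; the object-directory paths and the -march value are placeholders only.

# Sketch: 64-bit mode, ~640 MB copied per test, run against a freshly built compiler.
chmod +x test_stringop
./test_stringop 64 640000000 /path/to/objdir/gcc/xgcc -B /path/to/objdir/gcc/ -march=corei7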
On Wed, Dec 12, 2012 at 10:21 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote: >> On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> >> > cache than fully blowin function call. >> >> >> >> Be careful with this. My recollection is that REP sequence is good for >> >> any size -- for smaller size, the REP initial set up cost is too high >> >> (10s of cycles), while for large size copy, it is less efficient >> >> compared with library version. >> > >> > Well this is based on the data from the memtest script. >> > Core has good REP implementation - it is a win from rather small blocks (16 >> > bytes if I recall) and it does not need alignment. >> > Library version starts to be interesting with caching hints, but I think till 80KB >> > it is still not a win for my setup (glibc-2.15) >> >> A simple test shows that -mstringop-strategy=libcall always beats >> -mstringop-strategy=rep_8byte (on core2 and corei7) except for size >> smaller than 8 where the rep_8byte strategy simply bypasses REP movs. >> Can you share your memtest ? > > I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a > libcall. The PLT call overhead is simply too high. > The x86 string/memory functions in the current glibc are extremely fast and tuned for Core 2/Core i7. GCC is having a very hard time to beat them with inlining: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
> On Wed, Dec 12, 2012 at 10:21 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote:
> >> On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code
> >> >> > cache than fully blowin function call.
> >> >>
> >> >> Be careful with this. My recollection is that REP sequence is good for
> >> >> any size -- for smaller size, the REP initial set up cost is too high
> >> >> (10s of cycles), while for large size copy, it is less efficient
> >> >> compared with library version.
> >> >
> >> > Well this is based on the data from the memtest script.
> >> > Core has good REP implementation - it is a win from rather small blocks (16
> >> > bytes if I recall) and it does not need alignment.
> >> > Library version starts to be interesting with caching hints, but I think till 80KB
> >> > it is still not a win for my setup (glibc-2.15)
> >>
> >> A simple test shows that -mstringop-strategy=libcall always beats
> >> -mstringop-strategy=rep_8byte (on core2 and corei7) except for size
> >> smaller than 8 where the rep_8byte strategy simply bypasses REP movs.
> >> Can you share your memtest ?
> >
> > I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a
> > libcall. The PLT call overhead is simply too high.
>
> The x86 string/memory functions in the current glibc are
> extremely fast and tuned for Core 2/Core i7. GCC is having
> a very hard time to beat them with inlining:
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052

Here we speak about memcpy/memset only.  I never got around to modernizing
strlen and friends, unfortunately...

memcmp and friends are different beasts.  They really need some TLC...

Honza
On Thu, Dec 13, 2012 at 12:26 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> On Wed, Dec 12, 2012 at 10:21 PM, Jakub Jelinek <jakub@redhat.com> wrote: >> > On Wed, Dec 12, 2012 at 10:09:14PM -0800, Xinliang David Li wrote: >> >> On Wed, Dec 12, 2012 at 5:19 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >> >> > libcall is not faster up to 8KB to rep sequence that is better for regalloc/code >> >> >> > cache than fully blowin function call. >> >> >> >> >> >> Be careful with this. My recollection is that REP sequence is good for >> >> >> any size -- for smaller size, the REP initial set up cost is too high >> >> >> (10s of cycles), while for large size copy, it is less efficient >> >> >> compared with library version. >> >> > >> >> > Well this is based on the data from the memtest script. >> >> > Core has good REP implementation - it is a win from rather small blocks (16 >> >> > bytes if I recall) and it does not need alignment. >> >> > Library version starts to be interesting with caching hints, but I think till 80KB >> >> > it is still not a win for my setup (glibc-2.15) >> >> >> >> A simple test shows that -mstringop-strategy=libcall always beats >> >> -mstringop-strategy=rep_8byte (on core2 and corei7) except for size >> >> smaller than 8 where the rep_8byte strategy simply bypasses REP movs. >> >> Can you share your memtest ? >> > >> > I can't believe that say 16 byte or 32 byte memcpy can be ever faster using a >> > libcall. The PLT call overhead is simply too high. >> > >> >> The x86 string/memory functions in the current glibc are >> extremely fast and tuned for Core 2/Core i7. GCC is having >> a very hard time to beat them with inlining: >> >> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052 > > Here we speak about memcpy/memset only. I never got around to modernize > strlen and friends, unfortunately... > > memcmp and friends are different beats. They realy need some TLC... memcpy and memset in glibc are also extremely fast.
> > Here we speak about memcpy/memset only. I never got around to modernize
> > strlen and friends, unfortunately...
> >
> > memcmp and friends are different beats. They realy need some TLC...
>
> memcpy and memset in glibc are also extremely fast.

The default strategy now is to inline only when the block is known to be small
(either constant or via profile feedback; we do not really use the info on the
upper bound of the size of the copied object, which would be useful but is not
readily available at expansion time).

You can try the test_stringop script I attached and send me the results.  For
me, libc starts to be a win only for rather large blocks (i.e. >8KB).

Honza
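The effect of that default can be seen by comparing a constant-size copy with a runtime-size one. This is a rough sketch only; the exact thresholds depend on the active cost tables and tuning, and strategy_demo.c is a made-up file name.

# Sketch: a small constant-size memcpy is expanded inline at -O2, while an
# unknown-size copy stays a call to the library routine.
cat > strategy_demo.c <<'EOF'
#include <string.h>
void fixed16 (char *d, const char *s) { memcpy (d, s, 16); }
void anysize (char *d, const char *s, size_t n) { memcpy (d, s, n); }
EOF
gcc -O2 -S -o- strategy_demo.c | grep 'call.*memcpy'
# Expect the remaining call(s) to come from anysize only; with profile
# feedback the compiler can inline hot small-size copies as well.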
On Thu, Dec 13, 2012 at 12:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > Here we speak about memcpy/memset only. I never got around to modernize >> > strlen and friends, unfortunately... >> > >> > memcmp and friends are different beats. They realy need some TLC... >> >> memcpy and memset in glibc are also extremely fast. > > The default strategy now is to inline only when the block is known to be small > (either constant or via profile feedback, we do not really use the info on > upper bound of size of the copied object that would be useful but not readilly > available at expansion time). > > You can try the test_stringop script I attached and send me the results. For Areg, can you give it a try? Thanks. > me libc starts to be win only for rather large blocks (i.e. >8KB) > Which glibc are you using?
> > me libc starts to be win only for rather large blocks (i.e. >8KB) > > > > Which glibc are you using? 2.15 as it comes with opensuse 12.2 Honza > > -- > H.J.
We checked, no significant gains or losses. -----Original Message----- From: H.J. Lu [mailto:hjl.tools@gmail.com] Sent: Friday, December 14, 2012 1:03 AM To: Jan Hubicka Cc: Jakub Jelinek; Xinliang David Li; GCC Patches; Teresa Johnson; Melik-adamyan, Areg Subject: Re: [PATCH i386]: Enable push/pop in pro/epilogue for modern CPUs On Thu, Dec 13, 2012 at 12:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >> > Here we speak about memcpy/memset only. I never got around to >> > modernize strlen and friends, unfortunately... >> > >> > memcmp and friends are different beats. They realy need some TLC... >> >> memcpy and memset in glibc are also extremely fast. > > The default strategy now is to inline only when the block is known to > be small (either constant or via profile feedback, we do not really > use the info on upper bound of size of the copied object that would be > useful but not readilly available at expansion time). > > You can try the test_stringop script I attached and send me the > results. For Areg, can you give it a try? Thanks. > me libc starts to be win only for rather large blocks (i.e. >8KB) > Which glibc are you using? -- H.J.
On Thu, Dec 20, 2012 at 4:13 AM, Melik-adamyan, Areg <areg.melik-adamyan@intel.com> wrote: > We checked, no significant gains or losses. > > -----Original Message----- > From: H.J. Lu [mailto:hjl.tools@gmail.com] > Sent: Friday, December 14, 2012 1:03 AM > To: Jan Hubicka > Cc: Jakub Jelinek; Xinliang David Li; GCC Patches; Teresa Johnson; Melik-adamyan, Areg > Subject: Re: [PATCH i386]: Enable push/pop in pro/epilogue for modern CPUs > > On Thu, Dec 13, 2012 at 12:40 PM, Jan Hubicka <hubicka@ucw.cz> wrote: >>> > Here we speak about memcpy/memset only. I never got around to >>> > modernize strlen and friends, unfortunately... >>> > >>> > memcmp and friends are different beats. They realy need some TLC... >>> >>> memcpy and memset in glibc are also extremely fast. >> >> The default strategy now is to inline only when the block is known to >> be small (either constant or via profile feedback, we do not really >> use the info on upper bound of size of the copied object that would be >> useful but not readilly available at expansion time). >> >> You can try the test_stringop script I attached and send me the >> results. For > > Areg, can you give it a try? Thanks. > Hi Areg, Did you mean inlined memcpy/memset are as fast as the ones in libc.so on both ia32 and Intel64? Please keep in mind that memcpy/memset in libc.a may not be optimized. You must not use -static for linking.
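H.J.'s point can be checked directly with something like the sketch below; bench.c is a placeholder for whatever benchmark is being measured, and the static build assumes the static libc is installed.

# Sketch: build the same benchmark dynamically and statically, then compare.
gcc -O2 -o bench_dyn bench.c              # resolves memcpy from libc.so at run time
gcc -O2 -static -o bench_static bench.c   # links in the memcpy from libc.a
ldd bench_dyn | grep libc                 # confirms the shared libc dependency
time ./bench_dyn
time ./bench_static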
> Hi Areg,
>
> Did you mean inlined memcpy/memset are as fast as
> the ones in libc.so on both ia32 and Intel64?

I would be interested in the output of the stringop script.

> Please keep in mind that memcpy/memset in libc.a
> may not be optimized. You must not use -static for
> linking.

In my setup I use dynamic linking...  (This is a quite annoying property in
general - people tend to use --static for performance-critical binaries to
save the expense of PIC.  It would be really cool to have a way to call the
proper stringops based on the -march switch....)

Honza

> --
> H.J.
> > Hi Areg,
> >
> > Did you mean inlined memcpy/memset are as fast as
> > the ones in libc.so on both ia32 and Intel64?
>
> I would be interested in output of the stringop script.

Also, as far as I can remember, none of the spec2k6 benchmarks is really
stringop bound.  On spec2k, GCC was quite bound by memset (within alloc_rtx
and bitmap operations), but mostly it was collecting page faults there.
Inlining that one made quite a lot of difference on K8 hardware, but not on
later chips.

Honza
On Thu, Dec 20, 2012 at 7:06 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> > Hi Areg,
>> >
>> > Did you mean inlined memcpy/memset are as fast as
>> > the ones in libc.so on both ia32 and Intel64?
>>
>> I would be interested in output of the stringop script.
>
> Also as far as I can remember, none of spec2k6 benchmarks is really stringop
> bound. On Spec2k GCC was quite bound by memset (within alloc_rtx and bitmap
> oprations) but mostly by collecting page faults there. Inlining that one made
> quite a lot of difference on K8 hardware, but not on later chips.
>

There is a GCC performance regression bug on EEMBC.  It turns out that
-static was used for linking, so the optimized memory functions weren't used.
Removing -static fixed the performance regression.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c  (revision 194452)
+++ config/i386/i386.c  (working copy)
@@ -1620,14 +1620,14 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
-  {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
-   {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
+  {{libcall, {{8192, rep_prefix_4_byte, true}, {-1, libcall, false}}},
+   {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true},
               {-1, libcall, false}}}},
   {{libcall, {{6, loop_1_byte, true},
               {24, loop, true},
               {8192, rep_prefix_4_byte, true},
               {-1, libcall, false}}},
-   {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
+   {libcall, {{24, loop, true}, {8192, rep_prefix_8_byte, true},
               {-1, libcall, false}}}},
   1,                                    /* scalar_stmt_cost.  */
   1,                                    /* scalar load_cost.  */
@@ -1806,7 +1806,7 @@ static unsigned int initial_ix86_tune_fe
   m_PPRO,

   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
-  m_CORE2I7 | m_GENERIC,
+  m_GENERIC | m_CORE2,

   /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
    * on 16-bit immediate moves into memory on Core2 and Corei7.  */
@@ -1822,7 +1822,7 @@ static unsigned int initial_ix86_tune_fe
   m_K6,

   /* X86_TUNE_USE_CLTD */
-  ~(m_PENT | m_ATOM | m_K6),
+  ~(m_PENT | m_ATOM | m_K6 | m_GENERIC),

   /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
   m_PENT4,
@@ -1901,7 +1901,7 @@ static unsigned int initial_ix86_tune_fe
   m_COREI7 | m_BDVER,

   /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
-  m_BDVER ,
+  m_BDVER,

   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
      are resolved on SSE register parts instead of whole registers, so we may
@@ -1910,10 +1910,10 @@ static unsigned int initial_ix86_tune_fe
   m_ATHLON_K8,

   /* X86_TUNE_SSE_TYPELESS_STORES */
-  m_AMD_MULTIPLE,
+  m_AMD_MULTIPLE | m_CORE2I7, /*????*/

   /* X86_TUNE_SSE_LOAD0_BY_PXOR */
-  m_PPRO | m_P4_NOCONA,
+  m_PPRO | m_P4_NOCONA | m_CORE2I7, /*????*/

   /* X86_TUNE_MEMORY_MISMATCH_STALL */
   m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
@@ -1938,7 +1938,7 @@ static unsigned int initial_ix86_tune_fe

   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
      than 4 branch instructions in the 16 byte window.  */
-  m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
+  m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,

   /* X86_TUNE_SCHEDULE */
   m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
@@ -1947,10 +1947,10 @@ static unsigned int initial_ix86_tune_fe
   m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,

   /* X86_TUNE_USE_INCDEC */
-  ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
+  ~(m_P4_NOCONA | m_ATOM | m_GENERIC),

   /* X86_TUNE_PAD_RETURNS */
-  m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
+  m_AMD_MULTIPLE | m_GENERIC,

   /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion.  */
   m_ATOM,
@@ -1959,7 +1959,7 @@ static unsigned int initial_ix86_tune_fe
   m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,

   /* X86_TUNE_AVOID_VECTOR_DECODE */
-  m_CORE2I7 | m_K8 | m_GENERIC64,
+  m_K8 | m_GENERIC64,

   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
@@ -1967,11 +1967,11 @@ static unsigned int initial_ix86_tune_fe

   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
      vector path on AMD machines.  */
-  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER,

   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
      machines.  */
-  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
+  m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER,

   /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
      than a MOV.  */
@@ -1988,7 +1988,7 @@ static unsigned int initial_ix86_tune_fe

   /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
      from FP to FP.  */
-  m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
+  m_AMDFAM10 | m_GENERIC,

   /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
      from integer to FP.  */
@@ -1997,7 +1997,7 @@ static unsigned int initial_ix86_tune_fe
   /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
      with a subsequent conditional jump instruction into a single
      compare-and-branch uop.  */
-  m_BDVER,
+  m_BDVER | m_CORE2I7,

   /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
      will impact LEA instruction selection. */
@@ -2052,7 +2052,7 @@ static unsigned int initial_ix86_arch_fe
 };

 static const unsigned int x86_accumulate_outgoing_args
-  = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
+  = m_PPRO | m_P4_NOCONA | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;

 static const unsigned int x86_arch_always_fancy_math_387
   = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 194452)
+++ config/i386/i386.md (working copy)
@@ -965,7 +965,7 @@
          (compare:CC (match_operand:SDWIM 1 "nonimmediate_operand")
                      (match_operand:SDWIM 2 "<general_operand>")))
    (set (pc) (if_then_else
-               (match_operator 0 "ordered_comparison_operator"
+               (match_operator 0 "comparison_operator"
                 [(reg:CC FLAGS_REG) (const_int 0)])
                (label_ref (match_operand 3))
                (pc)))]
@@ -983,7 +983,7 @@
          (compare:CC (match_operand:SWIM 2 "nonimmediate_operand")
                      (match_operand:SWIM 3 "<general_operand>")))
    (set (match_operand:QI 0 "register_operand")
-          (match_operator 1 "ordered_comparison_operator"
+          (match_operator 1 "comparison_operator"
            [(reg:CC FLAGS_REG) (const_int 0)]))]
   ""
{
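As a closing note on the prologue/epilogue change that started this thread, the effect of the tuning flags is easy to inspect on a function that has to preserve several call-saved registers. This is only a sketch; the function and file name below are illustrations, and the exact registers saved will vary.

# Sketch: with the push/pop tuning enabled for -mtune=generic, the prologue of a
# register-heavy function should use push/pop rather than mov plus a single
# large stack-pointer adjustment.
cat > prologue_demo.c <<'EOF'
int use (int, int, int, int, int, int);
int f (int a, int b, int c, int d, int e, int g)
{
  int r = use (a, b, c, d, e, g);   /* keeps a..g live across the call */
  return r + a * b + c * d + e * g;
}
EOF
gcc -O2 -mtune=generic -S -o- prologue_demo.c | grep -E 'push|pop|sp'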