Message ID | CAFULd4ZaqyqpHkmHcpzDzEF5Y_spOsvz6cNJHD9zAWnr8QKsOQ@mail.gmail.com |
---|---|
State | New |
Headers | show |
On Tue, Oct 4, 2011 at 10:32 AM, Uros Bizjak <ubizjak@gmail.com> wrote: > On Tue, Oct 4, 2011 at 6:06 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > >>> OTOH, x86_64 and i686 targets can also benefit from this change. If >>> combine can't create more complex address (covered by lea), then it >>> will simply propagate memory operand back into the add insn. It looks >>> to me that we can't loose here, so: >>> >>> /* Improve address combine. */ >>> if (code == PLUS && MEM_P (src2)) >>> src2 = force_reg (mode, src2); >>> >>> Any opinions? >>> >> >> It doesn't work with 64bit libstdc++: > > Yeah, yeah. ix86_output_mi_thunk has some ... issues. > > Please try attached patch that introduces ix86_emit_binop and uses it > in a bunch of places. > > Uros. > I tried it on GCC. There are no regressions. The bugs are fixed for x32. Here are size comparison with GCC runtime libraries on ia32, x32 and x86-64: text data bss dec hex filename 103304 480 368 104152 196d8 32/libgcc/32/libgcc_s.so 103048 480 368 103896 195d8 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libgcc/32/libgcc_s.so text data bss dec hex filename 1053846 3100 360 1057306 10221a 32/libgfortran/.libs/libgfortran.so 1053574 3100 360 1057034 10210a ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libgfortran/.libs/libgfortran.so text data bss dec hex filename 56817 536 156 57509 e0a5 32/libgomp/.libs/libgomp.so 56817 536 156 57509 e0a5 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libgomp/.libs/libgomp.so text data bss dec hex filename 107815 1832 839016 948663 e79b7 32/libmudflap/.libs/libmudflap.so 107815 1832 839016 948663 e79b7 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libmudflap/.libs/libmudflap.so text data bss dec hex filename 110890 1976 839052 951918 e866e 32/libmudflap/.libs/libmudflapth.so 110890 1976 839052 951918 e866e ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libmudflap/.libs/libmudflapth.so text data bss dec hex filename 99113 2912 4748 106773 1a115 32/libobjc/.libs/libobjc.so 99113 2912 4748 106773 1a115 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libobjc/.libs/libobjc.so text data bss dec hex filename 357466 1148 12 358626 578e2 32/libquadmath/.libs/libquadmath.so 357218 1148 12 358378 577ea ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libquadmath/.libs/libquadmath.so text data bss dec hex filename 6018 388 8 6414 190e 32/libssp/.libs/libssp.so 6018 388 8 6414 190e ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libssp/.libs/libssp.so text data bss dec hex filename 884093 18600 27064 929757 e2fdd 32/libstdc++-v3/src/.libs/libstdc++.so 884189 18600 27064 929853 e303d ../../build-x86_64-linux/x86_64-unknown-linux-gnu/32/libstdc++-v3/src/.libs/libstdc++.so text data bss dec hex filename 83012 944 696 84652 14aac libgcc/libgcc_s.so 83012 944 696 84652 14aac ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libgcc/libgcc_s.so text data bss dec hex filename 1144593 6520 568 1151681 1192c1 libgfortran/.libs/libgfortran.so 1144561 6520 568 1151649 1192a1 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libgfortran/.libs/libgfortran.so text data bss dec hex filename 51958 1080 260 53298 d032 libgomp/.libs/libgomp.so 51958 1080 260 53298 d032 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libgomp/.libs/libgomp.so text data bss dec hex filename 111831 3332 1634728 1749891 1ab383 libmudflap/.libs/libmudflap.so 111831 3332 1634728 1749891 1ab383 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libmudflap/.libs/libmudflap.so text data bss dec hex filename 116430 3612 1634768 1754810 1ac6ba libmudflap/.libs/libmudflapth.so 116430 3612 1634768 1754810 1ac6ba ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libmudflap/.libs/libmudflapth.so text data bss dec hex filename 102240 4872 9320 116432 1c6d0 libobjc/.libs/libobjc.so 102240 4872 9320 116432 1c6d0 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libobjc/.libs/libobjc.so text data bss dec hex filename 213215 1744 16 214975 347bf libquadmath/.libs/libquadmath.so 213231 1744 16 214991 347cf ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libquadmath/.libs/libquadmath.so text data bss dec hex filename 6395 768 16 7179 1c0b libssp/.libs/libssp.so 6395 768 16 7179 1c0b ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libssp/.libs/libssp.so text data bss dec hex filename 923338 35872 85040 1044250 fef1a libstdc++-v3/src/.libs/libstdc++.so 923306 35872 85040 1044218 feefa ../../build-x86_64-linux/x86_64-unknown-linux-gnu/libstdc++-v3/src/.libs/libstdc++.so text data bss dec hex filename 80384 684 424 81492 13e54 x32/libgcc/x32/libgcc_s.so 80384 684 424 81492 13e54 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libgcc/x32/libgcc_s.so text data bss dec hex filename 1189512 5072 424 1195008 123c00 x32/libgfortran/.libs/libgfortran.so 1188856 5072 424 1194352 123970 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libgfortran/.libs/libgfortran.so text data bss dec hex filename 49028 736 152 49916 c2fc x32/libgomp/.libs/libgomp.so 49008 736 152 49896 c2e8 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libgomp/.libs/libgomp.so text data bss dec hex filename 102415 2592 839016 944023 e6797 x32/libmudflap/.libs/libmudflap.so 102399 2592 839016 944007 e6787 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libmudflap/.libs/libmudflap.so text data bss dec hex filename 106503 2820 839052 948375 e7897 x32/libmudflap/.libs/libmudflapth.so 106487 2820 839052 948359 e7887 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libmudflap/.libs/libmudflapth.so text data bss dec hex filename 96158 3648 4744 104550 19866 x32/libobjc/.libs/libobjc.so 96142 3648 4744 104534 19856 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libobjc/.libs/libobjc.so text data bss dec hex filename 213658 1468 12 215138 34862 x32/libquadmath/.libs/libquadmath.so 213578 1468 12 215058 34812 ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libquadmath/.libs/libquadmath.so text data bss dec hex filename 5551 500 8 6059 17ab x32/libssp/.libs/libssp.so 5551 500 8 6059 17ab ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libssp/.libs/libssp.so text data bss dec hex filename 810875 23224 27512 861611 d25ab x32/libstdc++-v3/src/.libs/libstdc++.so 810655 23224 27512 861391 d24cf ../../build-x86_64-linux/x86_64-unknown-linux-gnu/x32/libstdc++-v3/src/.libs/libstdc++.so The first number is from the old library and the second one is from the new library. The new libraries have smaller sizes. There are some exceptions on both ia32 and x86-64. For ia32 libstdc++: 884093 18600 27064 929757 e2fdd old libstdc++.so 884189 18600 27064 929853 e303d new libs/libstdc++.so The new code is mov 0xc(%edi),%eax mov %eax,0x8(%esi) mov -0xc(%eax),%eax mov 0x10(%edi),%edx lea 0x8(%esi,%eax,1),%eax The old one is mov 0xc(%edi),%edx lea 0x8(%esi),%eax mov %edx,0x8(%esi) add -0xc(%edx),%eax mov 0x10(%edi),%edx
On Tue, Oct 4, 2011 at 8:37 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>> OTOH, x86_64 and i686 targets can also benefit from this change. If >>>> combine can't create more complex address (covered by lea), then it >>>> will simply propagate memory operand back into the add insn. It looks >>>> to me that we can't loose here, so: >>>> >>>> /* Improve address combine. */ >>>> if (code == PLUS && MEM_P (src2)) >>>> src2 = force_reg (mode, src2); >>>> >>>> Any opinions? >>>> >>> >>> It doesn't work with 64bit libstdc++: >> >> Yeah, yeah. ix86_output_mi_thunk has some ... issues. >> >> Please try attached patch that introduces ix86_emit_binop and uses it >> in a bunch of places. > I tried it on GCC. There are no regressions. The bugs are fixed for x32. > Here are size comparison with GCC runtime libraries on ia32, x32 and > x86-64: > 884093 18600 27064 929757 e2fdd old libstdc++.so > 884189 18600 27064 929853 e303d new libs/libstdc++.so > > The new code is > > mov 0xc(%edi),%eax > mov %eax,0x8(%esi) > mov -0xc(%eax),%eax > mov 0x10(%edi),%edx > lea 0x8(%esi,%eax,1),%eax > > The old one is > > mov 0xc(%edi),%edx > lea 0x8(%esi),%eax > mov %edx,0x8(%esi) > add -0xc(%edx),%eax > mov 0x10(%edi),%edx The new code merged lea+add into one lea, so it looks quite OK to me. Do you have some performance numbers? Thanks, Uros.
On Tue, Oct 4, 2011 at 11:51 AM, Uros Bizjak <ubizjak@gmail.com> wrote: > On Tue, Oct 4, 2011 at 8:37 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > >>>>> OTOH, x86_64 and i686 targets can also benefit from this change. If >>>>> combine can't create more complex address (covered by lea), then it >>>>> will simply propagate memory operand back into the add insn. It looks >>>>> to me that we can't loose here, so: >>>>> >>>>> /* Improve address combine. */ >>>>> if (code == PLUS && MEM_P (src2)) >>>>> src2 = force_reg (mode, src2); >>>>> >>>>> Any opinions? >>>>> >>>> >>>> It doesn't work with 64bit libstdc++: >>> >>> Yeah, yeah. ix86_output_mi_thunk has some ... issues. >>> >>> Please try attached patch that introduces ix86_emit_binop and uses it >>> in a bunch of places. > >> I tried it on GCC. There are no regressions. The bugs are fixed for x32. >> Here are size comparison with GCC runtime libraries on ia32, x32 and >> x86-64: > >> 884093 18600 27064 929757 e2fdd old libstdc++.so >> 884189 18600 27064 929853 e303d new libs/libstdc++.so >> >> The new code is >> >> mov 0xc(%edi),%eax >> mov %eax,0x8(%esi) >> mov -0xc(%eax),%eax >> mov 0x10(%edi),%edx >> lea 0x8(%esi,%eax,1),%eax >> >> The old one is >> >> mov 0xc(%edi),%edx >> lea 0x8(%esi),%eax >> mov %edx,0x8(%esi) >> add -0xc(%edx),%eax >> mov 0x10(%edi),%edx > > The new code merged lea+add into one lea, so it looks quite OK to me. > > Do you have some performance numbers? > I will report performance numbers in a few days.
On Tue, Oct 4, 2011 at 11:58 AM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Tue, Oct 4, 2011 at 11:51 AM, Uros Bizjak <ubizjak@gmail.com> wrote: >> On Tue, Oct 4, 2011 at 8:37 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >> >>>>>> OTOH, x86_64 and i686 targets can also benefit from this change. If >>>>>> combine can't create more complex address (covered by lea), then it >>>>>> will simply propagate memory operand back into the add insn. It looks >>>>>> to me that we can't loose here, so: >>>>>> >>>>>> /* Improve address combine. */ >>>>>> if (code == PLUS && MEM_P (src2)) >>>>>> src2 = force_reg (mode, src2); >>>>>> >>>>>> Any opinions? >>>>>> >>>>> >>>>> It doesn't work with 64bit libstdc++: >>>> >>>> Yeah, yeah. ix86_output_mi_thunk has some ... issues. >>>> >>>> Please try attached patch that introduces ix86_emit_binop and uses it >>>> in a bunch of places. >> >>> I tried it on GCC. There are no regressions. The bugs are fixed for x32. >>> Here are size comparison with GCC runtime libraries on ia32, x32 and >>> x86-64: >> >>> 884093 18600 27064 929757 e2fdd old libstdc++.so >>> 884189 18600 27064 929853 e303d new libs/libstdc++.so >>> >>> The new code is >>> >>> mov 0xc(%edi),%eax >>> mov %eax,0x8(%esi) >>> mov -0xc(%eax),%eax >>> mov 0x10(%edi),%edx >>> lea 0x8(%esi,%eax,1),%eax >>> >>> The old one is >>> >>> mov 0xc(%edi),%edx >>> lea 0x8(%esi),%eax >>> mov %edx,0x8(%esi) >>> add -0xc(%edx),%eax >>> mov 0x10(%edi),%edx >> >> The new code merged lea+add into one lea, so it looks quite OK to me. >> >> Do you have some performance numbers? >> > > I will report performance numbers in a few days. The differences in SPEC CPU 2006 on ia32, x86-64 and x32 are within noise range.
Index: i386-protos.h =================================================================== --- i386-protos.h (revision 179506) +++ i386-protos.h (working copy) @@ -94,6 +94,7 @@ extern bool ix86_lea_outperforms (rtx, unsigned in unsigned int, unsigned int); extern bool ix86_avoid_lea_for_add (rtx, rtx[]); extern bool ix86_avoid_lea_for_addr (rtx, rtx[]); +extern void ix86_emit_binop (enum rtx_code, enum machine_mode, rtx, rtx); extern void ix86_split_lea_for_addr (rtx[], enum machine_mode); extern bool ix86_lea_for_add_ok (rtx, rtx[]); extern bool ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high); Index: i386.c =================================================================== --- i386.c (revision 179506) +++ i386.c (working copy) @@ -15727,6 +15727,10 @@ ix86_fixup_binary_operands (enum rtx_code code, en if (MEM_P (src1) && !rtx_equal_p (dst, src1)) src1 = force_reg (mode, src1); + /* Improve address combine. */ + if (code == PLUS && MEM_P (src2)) + src2 = force_reg (mode, src2); + operands[1] = src1; operands[2] = src2; return dst; @@ -16470,6 +16474,20 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost); } +/* Emit x86 binary operand CODE in mode MODE, where the first operand + matches destination. RTX includes clobber of FLAGS_REG. */ + +extern void ix86_emit_binop (enum rtx_code code, enum machine_mode mode, + rtx dst, rtx src) +{ + rtx op, clob; + + op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src)); + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob))); +} + /* Split lea instructions into a sequence of instructions which are executed on ALU to avoid AGU stalls. It is assumed that it is allowed to clobber flags register @@ -16482,8 +16500,7 @@ ix86_split_lea_for_addr (rtx operands[], enum mach unsigned int regno1 = INVALID_REGNUM; unsigned int regno2 = INVALID_REGNUM; struct ix86_address parts; - rtx tmp, clob; - rtvec par; + rtx tmp; int ok, adds; ok = ix86_decompose_address (operands[1], &parts); @@ -16515,14 +16532,7 @@ ix86_split_lea_for_addr (rtx operands[], enum mach gcc_assert (regno2 != regno0); for (adds = parts.scale; adds > 0; adds--) - { - tmp = gen_rtx_PLUS (mode, operands[0], parts.index); - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, - gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } + ix86_emit_binop (PLUS, mode, operands[0], parts.index); } else { @@ -16531,30 +16541,14 @@ ix86_split_lea_for_addr (rtx operands[], enum mach emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index)); /* Use shift for scaling. */ - tmp = gen_rtx_ASHIFT (mode, operands[0], - GEN_INT (exact_log2 (parts.scale))); - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); + ix86_emit_binop (ASHIFT, mode, operands[0], + GEN_INT (exact_log2 (parts.scale))); if (parts.base) - { - tmp = gen_rtx_PLUS (mode, operands[0], parts.base); - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } + ix86_emit_binop (PLUS, mode, operands[0], parts.base); if (parts.disp && parts.disp != const0_rtx) - { - tmp = gen_rtx_PLUS (mode, operands[0], parts.disp); - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } + ix86_emit_binop (PLUS, mode, operands[0], parts.disp); } } else if (!parts.base && !parts.index) @@ -16565,41 +16559,32 @@ ix86_split_lea_for_addr (rtx operands[], enum mach else { if (!parts.base) - { - if (regno0 != regno2) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index)); - } + { + if (regno0 != regno2) + emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.index)); + } else if (!parts.index) - { - if (regno0 != regno1) - emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base)); - } - else - { - if (regno0 == regno1) - tmp = gen_rtx_PLUS (mode, operands[0], parts.index); - else if (regno0 == regno2) - tmp = gen_rtx_PLUS (mode, operands[0], parts.base); - else - { + { + if (regno0 != regno1) emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base)); - tmp = gen_rtx_PLUS (mode, operands[0], parts.index); - } + } + else + { + if (regno0 == regno1) + tmp = parts.index; + else if (regno0 == regno2) + tmp = parts.base; + else + { + emit_insn (gen_rtx_SET (VOIDmode, operands[0], parts.base)); + tmp = parts.index; + } - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } + ix86_emit_binop (PLUS, mode, operands[0], tmp); + } if (parts.disp && parts.disp != const0_rtx) - { - tmp = gen_rtx_PLUS (mode, operands[0], parts.disp); - tmp = gen_rtx_SET (VOIDmode, operands[0], tmp); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - par = gen_rtvec (2, tmp, clob); - emit_insn (gen_rtx_PARALLEL (VOIDmode, par)); - } + ix86_emit_binop (PLUS, mode, operands[0], parts.disp); } } @@ -30940,7 +30925,7 @@ x86_output_mi_thunk (FILE *file, } } - emit_insn (ix86_gen_add3 (delta_dst, delta_dst, delta_rtx)); + ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx); } /* Adjust the this parameter by a value stored in the vtable. */ @@ -30983,7 +30968,7 @@ x86_output_mi_thunk (FILE *file, REGNO (this_reg)), vcall_mem)); else - emit_insn (ix86_gen_add3 (this_reg, this_reg, vcall_mem)); + ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem); } /* If necessary, drop THIS back to its stack slot. */