Message ID | 20230423131903.155998-1-xry111@xry111.site |
---|---|
State | New |
Headers | show |
Series | LoongArch: Enable shrink wrapping | expand |
Ok, I will do spec performance test comparison as soon as possible. Thanks! 在 2023/4/23 下午9:19, Xi Ruoyao 写道: > This commit implements the target macros for shrink wrapping of function > prologues/epilogues shrink wrapping on LoongArch. > > Bootstrapped and regtested on loongarch64-linux-gnu. I don't have an > access to SPEC CPU so I hope the reviewer can perform a benchmark to see > if there is real benefit. > > gcc/ChangeLog: > > * config/loongarch/loongarch.h (struct machine_function): Add > reg_is_wrapped_separately array for register wrapping > information. > * config/loongarch/loongarch.cc > (loongarch_get_separate_components): New function. > (loongarch_components_for_bb): Likewise. > (loongarch_disqualify_components): Likewise. > (loongarch_process_components): Likewise. > (loongarch_emit_prologue_components): Likewise. > (loongarch_emit_epilogue_components): Likewise. > (loongarch_set_handled_components): Likewise. > (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define. > (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise. > (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise. > (loongarch_for_each_saved_reg): Skip registers that are wrapped > separately. > > gcc/testsuite/ChangeLog: > > * gcc.target/loongarch/shrink-wrap.c: New test. > --- > gcc/config/loongarch/loongarch.cc | 179 +++++++++++++++++- > gcc/config/loongarch/loongarch.h | 2 + > .../gcc.target/loongarch/shrink-wrap.c | 22 +++ > 3 files changed, 200 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > > diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc > index e523fcb6b7f..d0024237a6a 100644 > --- a/gcc/config/loongarch/loongarch.cc > +++ b/gcc/config/loongarch/loongarch.cc > @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. 
If not see > #include "builtins.h" > #include "rtl-iter.h" > #include "opts.h" > +#include "function-abi.h" > > /* This file should be included last. */ > #include "target-def.h" > @@ -1017,19 +1018,23 @@ loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset, > for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > { > - loongarch_save_restore_reg (word_mode, regno, offset, fn); > + if (!cfun->machine->reg_is_wrapped_separately[regno]) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > + > offset -= UNITS_PER_WORD; > } > > /* This loop must iterate over the same space as its companion in > loongarch_compute_frame_info. */ > offset = cfun->machine->frame.fp_sp_offset - sp_offset; > + machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode; > + > for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > { > - machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode; > + if (!cfun->machine->reg_is_wrapped_separately[regno]) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > > - loongarch_save_restore_reg (mode, regno, offset, fn); > offset -= GET_MODE_SIZE (mode); > } > } > @@ -6644,6 +6649,151 @@ loongarch_asan_shadow_offset (void) > return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0; > } > > +static sbitmap > +loongarch_get_separate_components (void) > +{ > + HOST_WIDE_INT offset; > + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); > + bitmap_clear (components); > + offset = cfun->machine->frame.gp_sp_offset; > + > + /* The stack should be aligned to 16-bytes boundary, so we can make the use > + of ldptr instructions. 
*/ > + gcc_assert (offset % UNITS_PER_WORD == 0); > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > + { > + /* We can wrap general registers saved at [sp, sp + 32768) using the > + ldptr/stptr instructions. For large offsets a pseudo register > + might be needed which cannot be created during the shrink > + wrapping pass. > + > + TODO: This may need a revise when we add LA32 as ldptr.w is not > + guaranteed available by the manual. */ > + if (offset < 32768) > + bitmap_set_bit (components, regno); > + > + offset -= UNITS_PER_WORD; > + } > + > + offset = cfun->machine->frame.fp_sp_offset; > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > + { > + /* We can only wrap FP registers with imm12 offsets. For large > + offsets a pseudo register might be needed which cannot be > + created during the shrink wrapping pass. */ > + if (IMM12_OPERAND (offset)) > + bitmap_set_bit (components, regno); > + > + offset -= UNITS_PER_FPREG; > + } > + > + /* Don't mess with the hard frame pointer. */ > + if (frame_pointer_needed) > + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); > + > + bitmap_clear_bit (components, RETURN_ADDR_REGNUM); > + > + return components; > +} > + > +static sbitmap > +loongarch_components_for_bb (basic_block bb) > +{ > + /* Registers are used in a bb if they are in the IN, GEN, or KILL sets. 
*/ > + auto_bitmap used; > + bitmap_copy (used, DF_LIVE_IN (bb)); > + bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen); > + bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill); > + > + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); > + bitmap_clear (components); > + > + function_abi_aggregator callee_abis; > + rtx_insn *insn; > + FOR_BB_INSNS (bb, insn) > + if (CALL_P (insn)) > + callee_abis.note_callee_abi (insn_callee_abi (insn)); > + > + HARD_REG_SET extra_caller_saves = > + callee_abis.caller_save_regs (*crtl->abi); > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (!fixed_regs[regno] > + && !crtl->abi->clobbers_full_reg_p (regno) > + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) || > + bitmap_bit_p (used, regno))) > + bitmap_set_bit (components, regno); > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (!fixed_regs[regno] > + && !crtl->abi->clobbers_full_reg_p (regno) > + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) || > + bitmap_bit_p (used, regno))) > + bitmap_set_bit (components, regno); > + > + return components; > +} > + > +static void > +loongarch_disqualify_components (sbitmap, edge, sbitmap, bool) > +{ > + /* Do nothing. */ > +} > + > +static void > +loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn) > +{ > + HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset; > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > + { > + if (bitmap_bit_p (components, regno)) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > + > + offset -= UNITS_PER_WORD; > + } > + > + offset = cfun->machine->frame.fp_sp_offset; > + machine_mode mode = TARGET_DOUBLE_FLOAT ? 
DFmode : SFmode; > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > + { > + if (bitmap_bit_p (components, regno)) > + loongarch_save_restore_reg (mode, regno, offset, fn); > + > + offset -= UNITS_PER_FPREG; > + } > +} > + > +static void > +loongarch_emit_prologue_components (sbitmap components) > +{ > + loongarch_process_components (components, loongarch_save_reg); > +} > + > +static void > +loongarch_emit_epilogue_components (sbitmap components) > +{ > + loongarch_process_components (components, loongarch_restore_reg); > +} > + > +static void > +loongarch_set_handled_components (sbitmap components) > +{ > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (bitmap_bit_p (components, regno)) > + cfun->machine->reg_is_wrapped_separately[regno] = true; > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (bitmap_bit_p (components, regno)) > + cfun->machine->reg_is_wrapped_separately[regno] = true; > +} > + > /* Initialize the GCC target structure. 
*/ > #undef TARGET_ASM_ALIGNED_HI_OP > #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" > @@ -6841,6 +6991,29 @@ loongarch_asan_shadow_offset (void) > #undef TARGET_ASAN_SHADOW_OFFSET > #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset > > +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS > +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \ > + loongarch_get_separate_components > + > +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB > +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb > + > +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS > +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \ > + loongarch_disqualify_components > + > +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS > +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ > + loongarch_emit_prologue_components > + > +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS > +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ > + loongarch_emit_epilogue_components > + > +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS > +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \ > + loongarch_set_handled_components > + > struct gcc_target targetm = TARGET_INITIALIZER; > > #include "gt-loongarch.h" > diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h > index a9eff6a81bd..829acdaa9be 100644 > --- a/gcc/config/loongarch/loongarch.h > +++ b/gcc/config/loongarch/loongarch.h > @@ -1147,6 +1147,8 @@ struct GTY (()) machine_function > /* The current frame information, calculated by loongarch_compute_frame_info. 
> */ > struct loongarch_frame_info frame; > + > + bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER]; > }; > #endif > > diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > new file mode 100644 > index 00000000000..f2c867a2769 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O -fshrink-wrap" } */ > + > +/* f(x) should do nothing if x is 0. */ > +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */ > + > +void g(void); > + > +void > +f(int x) > +{ > + if (x) > + { > + register int s0 asm("s0") = x; > + register int s1 asm("s1") = x; > + register int s2 asm("s2") = x; > + asm("" : : "r"(s0)); > + asm("" : : "r"(s1)); > + asm("" : : "r"(s2)); > + g(); > + } > +}
+guojie 在 2023/4/23 下午9:19, Xi Ruoyao 写道: > This commit implements the target macros for shrink wrapping of function > prologues/epilogues shrink wrapping on LoongArch. > > Bootstrapped and regtested on loongarch64-linux-gnu. I don't have an > access to SPEC CPU so I hope the reviewer can perform a benchmark to see > if there is real benefit. > > gcc/ChangeLog: > > * config/loongarch/loongarch.h (struct machine_function): Add > reg_is_wrapped_separately array for register wrapping > information. > * config/loongarch/loongarch.cc > (loongarch_get_separate_components): New function. > (loongarch_components_for_bb): Likewise. > (loongarch_disqualify_components): Likewise. > (loongarch_process_components): Likewise. > (loongarch_emit_prologue_components): Likewise. > (loongarch_emit_epilogue_components): Likewise. > (loongarch_set_handled_components): Likewise. > (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define. > (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise. > (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise. > (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise. > (loongarch_for_each_saved_reg): Skip registers that are wrapped > separately. > > gcc/testsuite/ChangeLog: > > * gcc.target/loongarch/shrink-wrap.c: New test. > --- > gcc/config/loongarch/loongarch.cc | 179 +++++++++++++++++- > gcc/config/loongarch/loongarch.h | 2 + > .../gcc.target/loongarch/shrink-wrap.c | 22 +++ > 3 files changed, 200 insertions(+), 3 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > > diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc > index e523fcb6b7f..d0024237a6a 100644 > --- a/gcc/config/loongarch/loongarch.cc > +++ b/gcc/config/loongarch/loongarch.cc > @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3. 
If not see > #include "builtins.h" > #include "rtl-iter.h" > #include "opts.h" > +#include "function-abi.h" > > /* This file should be included last. */ > #include "target-def.h" > @@ -1017,19 +1018,23 @@ loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset, > for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > { > - loongarch_save_restore_reg (word_mode, regno, offset, fn); > + if (!cfun->machine->reg_is_wrapped_separately[regno]) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > + > offset -= UNITS_PER_WORD; > } > > /* This loop must iterate over the same space as its companion in > loongarch_compute_frame_info. */ > offset = cfun->machine->frame.fp_sp_offset - sp_offset; > + machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode; > + > for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > { > - machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode; > + if (!cfun->machine->reg_is_wrapped_separately[regno]) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > > - loongarch_save_restore_reg (mode, regno, offset, fn); > offset -= GET_MODE_SIZE (mode); > } > } > @@ -6644,6 +6649,151 @@ loongarch_asan_shadow_offset (void) > return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0; > } > > +static sbitmap > +loongarch_get_separate_components (void) > +{ > + HOST_WIDE_INT offset; > + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); > + bitmap_clear (components); > + offset = cfun->machine->frame.gp_sp_offset; > + > + /* The stack should be aligned to 16-bytes boundary, so we can make the use > + of ldptr instructions. 
*/ > + gcc_assert (offset % UNITS_PER_WORD == 0); > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > + { > + /* We can wrap general registers saved at [sp, sp + 32768) using the > + ldptr/stptr instructions. For large offsets a pseudo register > + might be needed which cannot be created during the shrink > + wrapping pass. > + > + TODO: This may need a revise when we add LA32 as ldptr.w is not > + guaranteed available by the manual. */ > + if (offset < 32768) > + bitmap_set_bit (components, regno); > + > + offset -= UNITS_PER_WORD; > + } > + > + offset = cfun->machine->frame.fp_sp_offset; > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > + { > + /* We can only wrap FP registers with imm12 offsets. For large > + offsets a pseudo register might be needed which cannot be > + created during the shrink wrapping pass. */ > + if (IMM12_OPERAND (offset)) > + bitmap_set_bit (components, regno); > + > + offset -= UNITS_PER_FPREG; > + } > + > + /* Don't mess with the hard frame pointer. */ > + if (frame_pointer_needed) > + bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); > + > + bitmap_clear_bit (components, RETURN_ADDR_REGNUM); > + > + return components; > +} > + > +static sbitmap > +loongarch_components_for_bb (basic_block bb) > +{ > + /* Registers are used in a bb if they are in the IN, GEN, or KILL sets. 
*/ > + auto_bitmap used; > + bitmap_copy (used, DF_LIVE_IN (bb)); > + bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen); > + bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill); > + > + sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER); > + bitmap_clear (components); > + > + function_abi_aggregator callee_abis; > + rtx_insn *insn; > + FOR_BB_INSNS (bb, insn) > + if (CALL_P (insn)) > + callee_abis.note_callee_abi (insn_callee_abi (insn)); > + > + HARD_REG_SET extra_caller_saves = > + callee_abis.caller_save_regs (*crtl->abi); > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (!fixed_regs[regno] > + && !crtl->abi->clobbers_full_reg_p (regno) > + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) || > + bitmap_bit_p (used, regno))) > + bitmap_set_bit (components, regno); > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (!fixed_regs[regno] > + && !crtl->abi->clobbers_full_reg_p (regno) > + && (TEST_HARD_REG_BIT (extra_caller_saves, regno) || > + bitmap_bit_p (used, regno))) > + bitmap_set_bit (components, regno); > + > + return components; > +} > + > +static void > +loongarch_disqualify_components (sbitmap, edge, sbitmap, bool) > +{ > + /* Do nothing. */ > +} > + > +static void > +loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn) > +{ > + HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset; > + > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST)) > + { > + if (bitmap_bit_p (components, regno)) > + loongarch_save_restore_reg (word_mode, regno, offset, fn); > + > + offset -= UNITS_PER_WORD; > + } > + > + offset = cfun->machine->frame.fp_sp_offset; > + machine_mode mode = TARGET_DOUBLE_FLOAT ? 
DFmode : SFmode; > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST)) > + { > + if (bitmap_bit_p (components, regno)) > + loongarch_save_restore_reg (mode, regno, offset, fn); > + > + offset -= UNITS_PER_FPREG; > + } > +} > + > +static void > +loongarch_emit_prologue_components (sbitmap components) > +{ > + loongarch_process_components (components, loongarch_save_reg); > +} > + > +static void > +loongarch_emit_epilogue_components (sbitmap components) > +{ > + loongarch_process_components (components, loongarch_restore_reg); > +} > + > +static void > +loongarch_set_handled_components (sbitmap components) > +{ > + for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++) > + if (bitmap_bit_p (components, regno)) > + cfun->machine->reg_is_wrapped_separately[regno] = true; > + > + for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++) > + if (bitmap_bit_p (components, regno)) > + cfun->machine->reg_is_wrapped_separately[regno] = true; > +} > + > /* Initialize the GCC target structure. 
*/ > #undef TARGET_ASM_ALIGNED_HI_OP > #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t" > @@ -6841,6 +6991,29 @@ loongarch_asan_shadow_offset (void) > #undef TARGET_ASAN_SHADOW_OFFSET > #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset > > +#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS > +#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \ > + loongarch_get_separate_components > + > +#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB > +#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb > + > +#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS > +#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \ > + loongarch_disqualify_components > + > +#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS > +#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ > + loongarch_emit_prologue_components > + > +#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS > +#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ > + loongarch_emit_epilogue_components > + > +#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS > +#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \ > + loongarch_set_handled_components > + > struct gcc_target targetm = TARGET_INITIALIZER; > > #include "gt-loongarch.h" > diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h > index a9eff6a81bd..829acdaa9be 100644 > --- a/gcc/config/loongarch/loongarch.h > +++ b/gcc/config/loongarch/loongarch.h > @@ -1147,6 +1147,8 @@ struct GTY (()) machine_function > /* The current frame information, calculated by loongarch_compute_frame_info. 
> */ > struct loongarch_frame_info frame; > + > + bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER]; > }; > #endif > > diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > new file mode 100644 > index 00000000000..f2c867a2769 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O -fshrink-wrap" } */ > + > +/* f(x) should do nothing if x is 0. */ > +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */ > + > +void g(void); > + > +void > +f(int x) > +{ > + if (x) > + { > + register int s0 asm("s0") = x; > + register int s1 asm("s1") = x; > + register int s2 asm("s2") = x; > + asm("" : : "r"(s0)); > + asm("" : : "r"(s1)); > + asm("" : : "r"(s2)); > + g(); > + } > +}
/* snip */ >> diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >> b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >> new file mode 100644 >> index 00000000000..f2c867a2769 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >> @@ -0,0 +1,22 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O -fshrink-wrap" } */ >> + >> +/* f(x) should do nothing if x is 0. */ >> +/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" >> } } */ >> + >> +void g(void); >> + >> +void >> +f(int x) >> +{ >> + if (x) >> + { >> + register int s0 asm("s0") = x; >> + register int s1 asm("s1") = x; >> + register int s2 asm("s2") = x; >> + asm("" : : "r"(s0)); >> + asm("" : : "r"(s1)); >> + asm("" : : "r"(s2)); >> + g(); >> + } >> +} I think the test case cannot fully reflect the optimization effect of the current patch, because even without the patch, -O -fshrink-wrap will still perform architecture independent optimization. This patch considers architecture related registers as finer grained optimization for shrink wrapping, I think a test case like the one below is more suitable: int foo(int x) { if (x) { __asm__ ("":::"s0","s1"); return x; } __asm__ ("":::"s2","s3"); return 0; } Otherwise LGTM, thanks!
Hi, ruoyao: The SPEC2006 performance test is finished. The fixed-point benchmark 400.perlbench shows about a 3% performance improvement, the other fixed-point benchmarks are basically unchanged, and the floating-point tests have basically remained the same. Do you have any questions about the test case mentioned by Guo Jie? If there is no problem, please modify the test case; after that I think the code can be merged into the main branch. Thanks! 在 2023/4/25 下午5:12, Guo Jie 写道: > /* snip */ > >>> diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >>> b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >>> new file mode 100644 >>> index 00000000000..f2c867a2769 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c >>> @@ -0,0 +1,22 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O -fshrink-wrap" } */ >>> + >>> +/* f(x) should do nothing if x is 0. */ >>> +/* { dg-final { scan-assembler >>> "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */ >>> + >>> +void g(void); >>> + >>> +void >>> +f(int x) >>> +{ >>> + if (x) >>> + { >>> + register int s0 asm("s0") = x; >>> + register int s1 asm("s1") = x; >>> + register int s2 asm("s2") = x; >>> + asm("" : : "r"(s0)); >>> + asm("" : : "r"(s1)); >>> + asm("" : : "r"(s2)); >>> + g(); >>> + } >>> +} > > I think the test case cannot fully reflect the optimization effect of > the current patch, > > because even without the patch, -O -fshrink-wrap will still perform > architecture independent optimization. > > This patch considers architecture related registers as finer grained > optimization for shrink wrapping, > > I think a test case like the one below is more suitable: > > > int foo(int x) > { > if (x) > { > __asm__ ("":::"s0","s1"); > return x; > } > > __asm__ ("":::"s2","s3"); > return 0; > } > > Otherwise LGTM, thanks!
On 2023/4/26 17:53, Lulu Cheng wrote: > Hi, ruoyao: > > The performance of spec2006 is finished. The fixed-point > 400.perlbench has about 3% performance improvement, > > and the other basics have not changed, and the floating-point tests > have basically remained the same. Nice to know! > > Do you have any questions about the test cases mentioned by Guo > Jie? If there is no problem, modify the test case, > > I think the code can be merged into the main branch. > > <snip> BTW what about the previous function/loop alignment patches? The LLVM changes are also waiting for such results. ;-)
在 2023/4/26 下午6:02, WANG Xuerui 写道: > > On 2023/4/26 17:53, Lulu Cheng wrote: >> Hi, ruoyao: >> >> The performance of spec2006 is finished. The fixed-point >> 400.perlbench has about 3% performance improvement, >> >> and the other basics have not changed, and the floating-point tests >> have basically remained the same. > Nice to know! >> >> Do you have any questions about the test cases mentioned by Guo >> Jie? If there is no problem, modify the test case, >> >> I think the code can be merged into the main branch. >> >> <snip> > BTW what about the previous function/loop alignment patches? The LLVM > changes are also waiting for such results. ;-) Well, there are many combinations in this alignment test, so the testing will take a long time. I will reply with the results as soon as they come out. :-)
On 2023/4/26 18:14, Lulu Cheng wrote: > > 在 2023/4/26 下午6:02, WANG Xuerui 写道: >> >> On 2023/4/26 17:53, Lulu Cheng wrote: >>> Hi, ruoyao: >>> >>> The performance of spec2006 is finished. The fixed-point >>> 400.perlbench has about 3% performance improvement, >>> >>> and the other basics have not changed, and the floating-point tests >>> have basically remained the same. >> Nice to know! >>> >>> Do you have any questions about the test cases mentioned by >>> Guo Jie? If there is no problem, modify the test case, >>> >>> I think the code can be merged into the main branch. >>> >>> <snip> >> BTW what about the previous function/loop alignment patches? The LLVM >> changes are also waiting for such results. ;-) > Well, there are many combinations in this align test, so the test time > will be very long. I will reply the result as soon as the test results > come out.:-) > Oh, I got it. Thanks very much for all the tests, and take your time!
On Wed, 2023-04-26 at 17:53 +0800, Lulu Cheng wrote: > Hi, ruoyao: > > The performance of spec2006 is finished. The fixed-point > 400.perlbench has about 3% performance improvement, > > and the other basics have not changed, and the floating-point tests > have > basically remained the same. > > Do you have any questions about the test cases mentioned by Guo > Jie? If there is no problem, modify the test case, > > I think the code can be merged into the main branch. I'll rewrite the test and commit in a few days (now I'm occupied with something :( ). > > Thanks! > > 在 2023/4/25 下午5:12, Guo Jie 写道: > > /* snip */ > > > > > > diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > > > > b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > > > > new file mode 100644 > > > > index 00000000000..f2c867a2769 > > > > --- /dev/null > > > > +++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c > > > > @@ -0,0 +1,22 @@ > > > > +/* { dg-do compile } */ > > > > +/* { dg-options "-O -fshrink-wrap" } */ > > > > + > > > > +/* f(x) should do nothing if x is 0. */ > > > > +/* { dg-final { scan-assembler > > > > "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */ > > > > + > > > > +void g(void); > > > > + > > > > +void > > > > +f(int x) > > > > +{ > > > > + if (x) > > > > + { > > > > + register int s0 asm("s0") = x; > > > > + register int s1 asm("s1") = x; > > > > + register int s2 asm("s2") = x; > > > > + asm("" : : "r"(s0)); > > > > + asm("" : : "r"(s1)); > > > > + asm("" : : "r"(s2)); > > > > + g(); > > > > + } > > > > +} > > > > I think the test case cannot fully reflect the optimization effect > > of > > the current patch, > > > > because even without the patch, -O -fshrink-wrap will still perform > > architecture independent optimization. 
> > > > This patch considers architecture related registers as finer grained > > optimization for shrink wrapping, > > > > I think a test case like the one below is more suitable: > > > > > > int foo(int x) > > { > > if (x) > > { > > __asm__ ("":::"s0","s1"); > > return x; > > } > > > > __asm__ ("":::"s2","s3"); > > return 0; > > } > > > > Otherwise LGTM, thanks! >
On Wed, 2023-04-26 at 18:21 +0800, WANG Xuerui wrote: > On 2023/4/26 18:14, Lulu Cheng wrote: > > > > 在 2023/4/26 下午6:02, WANG Xuerui 写道: > > > > > > On 2023/4/26 17:53, Lulu Cheng wrote: > > > > Hi, ruoyao: > > > > > > > > The performance of spec2006 is finished. The fixed-point > > > > 400.perlbench has about 3% performance improvement, > > > > > > > > and the other basics have not changed, and the floating-point tests > > > > have basically remained the same. > > > Nice to know! > > > > > > > > Do you have any questions about the test cases mentioned by > > > > Guo Jie? If there is no problem, modify the test case, > > > > > > > > I think the code can be merged into the main branch. > > > > > > > > <snip> > > > BTW what about the previous function/loop alignment patches? The LLVM > > > changes are also waiting for such results. ;-) > > Well, there are many combinations in this align test, so the test time > > will be very long. I will reply the result as soon as the test results > > come out.:-) > > > Oh, I got. Thanks very much for all the tests and take your time! Sorry if it's noisy, but I hope there is some (maybe preliminary) result: now I finally have some spare time to rebuild the system with GCC 13 and I'd like to use some -falign-functions= in my CFLAGS :).
在 2023/5/7 上午1:07, Xi Ruoyao 写道: > On Wed, 2023-04-26 at 18:21 +0800, WANG Xuerui wrote: >> On 2023/4/26 18:14, Lulu Cheng wrote: >>> 在 2023/4/26 下午6:02, WANG Xuerui 写道: >>>> On 2023/4/26 17:53, Lulu Cheng wrote: >>>>> Hi, ruoyao: >>>>> >>>>> The performance of spec2006 is finished. The fixed-point >>>>> 400.perlbench has about 3% performance improvement, >>>>> >>>>> and the other basics have not changed, and the floating-point tests >>>>> have basically remained the same. >>>> Nice to know! >>>>> Do you have any questions about the test cases mentioned by >>>>> Guo Jie? If there is no problem, modify the test case, >>>>> >>>>> I think the code can be merged into the main branch. >>>>> >>>>> <snip> >>>> BTW what about the previous function/loop alignment patches? The LLVM >>>> changes are also waiting for such results. ;-) >>> Well, there are many combinations in this align test, so the test time >>> will be very long. I will reply the result as soon as the test results >>> come out.:-) >>> >> Oh, I got. Thanks very much for all the tests and take your time! > Sorry if it's noisy, but I hope there is some (maybe preliminary) > result: now I finally have some spare time to rebuild the system with > GCC 13 and I'd like to use some -falign-functions= in my CFLAGS :). > The test is still ongoing, and I will reply with the results by email after the test is completed. :-)
diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
index e523fcb6b7f..d0024237a6a 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -64,6 +64,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "rtl-iter.h"
 #include "opts.h"
+#include "function-abi.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -1017,19 +1018,23 @@ loongarch_for_each_saved_reg (HOST_WIDE_INT sp_offset,
   for (int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
       {
-	loongarch_save_restore_reg (word_mode, regno, offset, fn);
+	if (!cfun->machine->reg_is_wrapped_separately[regno])
+	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
+
 	offset -= UNITS_PER_WORD;
       }
 
   /* This loop must iterate over the same space as its companion in
      loongarch_compute_frame_info.  */
   offset = cfun->machine->frame.fp_sp_offset - sp_offset;
+  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+
   for (int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
     if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
       {
-	machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+	if (!cfun->machine->reg_is_wrapped_separately[regno])
+	  loongarch_save_restore_reg (mode, regno, offset, fn);
 
-	loongarch_save_restore_reg (mode, regno, offset, fn);
 	offset -= GET_MODE_SIZE (mode);
       }
 }
@@ -6644,6 +6649,164 @@ loongarch_asan_shadow_offset (void)
   return TARGET_64BIT ? (HOST_WIDE_INT_1 << 46) : 0;
 }
 
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  Return the set of
+   saved registers whose save/restore can be shrink-wrapped separately,
+   i.e. whose stack slot is reachable with an immediate offset.  */
+static sbitmap
+loongarch_get_separate_components (void)
+{
+  HOST_WIDE_INT offset;
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+  offset = cfun->machine->frame.gp_sp_offset;
+
+  /* The GP save area must be word-aligned so that every slot offset is
+     valid for the ldptr/stptr instructions.  */
+  gcc_assert (offset % UNITS_PER_WORD == 0);
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	/* We can wrap general registers saved at [sp, sp + 32768) using the
+	   ldptr/stptr instructions.  For large offsets a pseudo register
+	   might be needed which cannot be created during the shrink
+	   wrapping pass.
+
+	   TODO: This may need a revision when we add LA32 as ldptr.w is not
+	   guaranteed available by the manual.  */
+	if (offset < 32768)
+	  bitmap_set_bit (components, regno);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	/* We can only wrap FP registers with imm12 offsets.  For large
+	   offsets a pseudo register might be needed which cannot be
+	   created during the shrink wrapping pass.  */
+	if (IMM12_OPERAND (offset))
+	  bitmap_set_bit (components, regno);
+
+	offset -= UNITS_PER_FPREG;
+      }
+
+  /* Don't mess with the hard frame pointer.  */
+  if (frame_pointer_needed)
+    bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+  bitmap_clear_bit (components, RETURN_ADDR_REGNUM);
+
+  return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  Return the callee-saved
+   registers that BB uses, plus those that must be saved around calls in BB
+   to functions whose ABI clobbers more than the current function's.  */
+static sbitmap
+loongarch_components_for_bb (basic_block bb)
+{
+  /* Registers are used in a bb if they are in the IN, GEN, or KILL sets.  */
+  auto_bitmap used;
+  bitmap_copy (used, DF_LIVE_IN (bb));
+  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->gen);
+  bitmap_ior_into (used, &DF_LIVE_BB_INFO (bb)->kill);
+
+  sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+  bitmap_clear (components);
+
+  function_abi_aggregator callee_abis;
+  rtx_insn *insn;
+  FOR_BB_INSNS (bb, insn)
+    if (CALL_P (insn))
+      callee_abis.note_callee_abi (insn_callee_abi (insn));
+
+  HARD_REG_SET extra_caller_saves
+    = callee_abis.caller_save_regs (*crtl->abi);
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno)
+	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+	    || bitmap_bit_p (used, regno)))
+      bitmap_set_bit (components, regno);
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (!fixed_regs[regno]
+	&& !crtl->abi->clobbers_full_reg_p (regno)
+	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+	    || bitmap_bit_p (used, regno)))
+      bitmap_set_bit (components, regno);
+
+  return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.  */
+static void
+loongarch_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+  /* Do nothing.  */
+}
+
+/* Apply FN (a save or a restore) to every register in COMPONENTS, walking
+   the save slots in the same order as loongarch_for_each_saved_reg.  */
+static void
+loongarch_process_components (sbitmap components, loongarch_save_restore_fn fn)
+{
+  HOST_WIDE_INT offset = cfun->machine->frame.gp_sp_offset;
+
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.mask, regno - GP_REG_FIRST))
+      {
+	if (bitmap_bit_p (components, regno))
+	  loongarch_save_restore_reg (word_mode, regno, offset, fn);
+
+	offset -= UNITS_PER_WORD;
+      }
+
+  offset = cfun->machine->frame.fp_sp_offset;
+  machine_mode mode = TARGET_DOUBLE_FLOAT ? DFmode : SFmode;
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (BITSET_P (cfun->machine->frame.fmask, regno - FP_REG_FIRST))
+      {
+	if (bitmap_bit_p (components, regno))
+	  loongarch_save_restore_reg (mode, regno, offset, fn);
+
+	offset -= UNITS_PER_FPREG;
+      }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
+static void
+loongarch_emit_prologue_components (sbitmap components)
+{
+  loongarch_process_components (components, loongarch_save_reg);
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
+static void
+loongarch_emit_epilogue_components (sbitmap components)
+{
+  loongarch_process_components (components, loongarch_restore_reg);
+}
+
+/* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  Record COMPONENTS so
+   that loongarch_for_each_saved_reg skips them.  */
+static void
+loongarch_set_handled_components (sbitmap components)
+{
+  for (unsigned int regno = GP_REG_FIRST; regno <= GP_REG_LAST; regno++)
+    if (bitmap_bit_p (components, regno))
+      cfun->machine->reg_is_wrapped_separately[regno] = true;
+
+  for (unsigned int regno = FP_REG_FIRST; regno <= FP_REG_LAST; regno++)
+    if (bitmap_bit_p (components, regno))
+      cfun->machine->reg_is_wrapped_separately[regno] = true;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -6841,6 +7004,29 @@ loongarch_asan_shadow_offset (void)
 #undef TARGET_ASAN_SHADOW_OFFSET
 #define TARGET_ASAN_SHADOW_OFFSET loongarch_asan_shadow_offset
 
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
+  loongarch_get_separate_components
+
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB loongarch_components_for_bb
+
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
+  loongarch_disqualify_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+  loongarch_emit_prologue_components
+
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+  loongarch_emit_epilogue_components
+
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
+  loongarch_set_handled_components
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-loongarch.h"
diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h
index a9eff6a81bd..829acdaa9be 100644
--- a/gcc/config/loongarch/loongarch.h
+++ b/gcc/config/loongarch/loongarch.h
@@ -1147,6 +1147,10 @@ struct GTY (()) machine_function
   /* The current frame information, calculated by loongarch_compute_frame_info.
    */
   struct loongarch_frame_info frame;
+
+  /* Indexed by hard register number: true if the register was handed to the
+     separate shrink-wrapping components by loongarch_set_handled_components.  */
+  bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
 };
 #endif
 
diff --git a/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
new file mode 100644
index 00000000000..f2c867a2769
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/shrink-wrap.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fshrink-wrap" } */
+
+/* f(x) should do nothing if x is 0.  */
+/* { dg-final { scan-assembler "bnez\t\\\$r4,\[^\n\]*\n\tjr\t\\\$r1" } } */
+
+void g(void);
+
+void
+f(int x)
+{
+  if (x)
+    {
+      register int s0 asm("s0") = x;
+      register int s1 asm("s1") = x;
+      register int s2 asm("s2") = x;
+      asm("" : : "r"(s0));
+      asm("" : : "r"(s1));
+      asm("" : : "r"(s2));
+      g();
+    }
+}