Message ID | 20231212064754.6623-1-xry111@xry111.site |
---|---|
State | New |
Headers | show |
Series | LoongArch: Replace -mexplicit-relocs=auto simple-used address peephole2 with combine | expand |
Ping :). On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote: > The problem with peephole2 is it uses a naive sliding-window algorithm > and misses many cases. For example: > > float a[10000]; > float t() { return a[0] + a[8000]; } > > is compiled to: > > la.local $r13,a > la.local $r12,a+32768 > fld.s $f1,$r13,0 > fld.s $f0,$r12,-768 > fadd.s $f0,$f1,$f0 > > by trunk. But as we've explained in r14-4851, the following would be > better with -mexplicit-relocs=auto: > > pcalau12i $r13,%pc_hi20(a) > pcalau12i $r12,%pc_hi20(a+32000) > fld.s $f1,$r13,%pc_lo12(a) > fld.s $f0,$r12,%pc_lo12(a+32000) > fadd.s $f0,$f1,$f0 > > However the sliding-window algorithm just won't detect the pcalau12i/fld > pair to be optimized. Use a define_insn_and_split in combine pass will > work around the issue. > > gcc/ChangeLog: > > * config/loongarch/loongarch.md: > (simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New > define_insn_and_split. > (simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise. > (simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise. > (simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>): > Likewise. > (simple_store<ST_ANY:mode><P:mode>): Likewise. > (simple_store_off<ST_ANY:mode><P:mode>): Likewise. > (define_peephole2): Remove la.local/[f]ld peepholes. > > gcc/testsuite/ChangeLog: > > * gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c: > New test. > --- > > Bootstrapped & regtested on loongarch64-linux-gnu. Ok for trunk? 
> > gcc/config/loongarch/loongarch.md | 165 +++++++++--------- > ...explicit-relocs-auto-single-load-store-2.c | 11 ++ > 2 files changed, 98 insertions(+), 78 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c > > diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md > index 7b26d15aa4e..4009de408fb 100644 > --- a/gcc/config/loongarch/loongarch.md > +++ b/gcc/config/loongarch/loongarch.md > @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w" > ;; > ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with > ;; 3 instructions). > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") > - (mem:LD_AT_LEAST_32_BIT (match_dup 0)))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0]) \ > - || REGNO (operands[0]) == REGNO (operands[2]))" > - [(set (match_dup 2) > - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] > +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>" > + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") > + (mem:LD_AT_LEAST_32_BIT > + (match_operand:P 1 "symbolic_pcrel_operand" "")))] > + "loongarch_pre_reload_split () \ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (match_dup 0) > + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] > { > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + operands[2] = gen_reg_rtx (Pmode); > + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); > }) > > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set 
(match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") > - (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0) > - (match_operand 3 "const_int_operand"))))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0]) \ > - || REGNO (operands[0]) == REGNO (operands[2]))" > - [(set (match_dup 2) > - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] > +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>" > + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") > + (mem:LD_AT_LEAST_32_BIT > + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") > + (match_operand 2 "const_int_operand" ""))))] > + "loongarch_pre_reload_split () \ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (match_dup 0) > + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] > { > - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + HOST_WIDE_INT offset = INTVAL (operands[2]); > + operands[2] = gen_reg_rtx (Pmode); > + operands[1] = plus_constant (Pmode, operands[1], offset); > + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); > }) > > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set (match_operand:GPR 2 "register_operand") > - (any_extend:GPR (mem:SUBDI (match_dup 0))))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0]) \ > - || REGNO (operands[0]) == REGNO (operands[2]))" > - [(set (match_dup 2) > - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) > - (match_dup 1)))))] > +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>" > + [(set 
(match_operand:GPR 0 "register_operand" "=r") > + (any_extend:GPR > + (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))] > + "loongarch_pre_reload_split () \ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (match_dup 0) > + (any_extend:GPR > + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] > { > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + operands[2] = gen_reg_rtx (Pmode); > + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); > }) > > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set (match_operand:GPR 2 "register_operand") > +(define_insn_and_split > + "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>" > + [(set (match_operand:GPR 0 "register_operand" "=r") > + (any_extend:GPR > + (mem:SUBDI > + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") > + (match_operand 2 "const_int_operand" "")))))] > + "loongarch_pre_reload_split () \ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (match_dup 0) > (any_extend:GPR > - (mem:SUBDI (plus (match_dup 0) > - (match_operand 3 "const_int_operand")))))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0]) \ > - || REGNO (operands[0]) == REGNO (operands[2]))" > - [(set (match_dup 2) > - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) > - (match_dup 1)))))] > + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] > { > - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + HOST_WIDE_INT offset = INTVAL (operands[2]); > + operands[2] = gen_reg_rtx (Pmode); > + operands[1] = plus_constant (Pmode, operands[1], offset); > + 
emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); > }) > > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set (mem:ST_ANY (match_dup 0)) > - (match_operand:ST_ANY 2 "register_operand"))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0])) \ > - && REGNO (operands[0]) != REGNO (operands[2])" > - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] > +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>" > + [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand")) > + (match_operand:ST_ANY 1 "register_operand" "r,f"))] > + "loongarch_pre_reload_split () \ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))] > { > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + operands[2] = gen_reg_rtx (Pmode); > + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0])); > }) > > -(define_peephole2 > - [(set (match_operand:P 0 "register_operand") > - (match_operand:P 1 "symbolic_pcrel_operand")) > - (set (mem:ST_ANY (plus (match_dup 0) > - (match_operand 3 "const_int_operand"))) > - (match_operand:ST_ANY 2 "register_operand"))] > - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ > - && (peep2_reg_dead_p (2, operands[0])) \ > - && REGNO (operands[0]) != REGNO (operands[2])" > - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] > +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>" > + [(set (mem:ST_ANY > + (plus (match_operand:P 0 "symbolic_pcrel_operand" "") > + (match_operand 1 "const_int_operand" ""))) > + (match_operand:ST_ANY 2 "register_operand" "r,f"))] > + "loongarch_pre_reload_split () 
\ > + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ > + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" > + "#" > + "" > + [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))] > { > - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); > - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); > + HOST_WIDE_INT offset = INTVAL (operands[1]); > + operands[1] = gen_reg_rtx (Pmode); > + operands[0] = plus_constant (Pmode, operands[0], offset); > + emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0])); > }) > > ;; Synchronization instructions. > diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c > new file mode 100644 > index 00000000000..42cb966d1e0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */ > + > +float a[8001]; > +float > +t (void) > +{ > + return a[0] + a[8000]; > +} > + > +/* { dg-final { scan-assembler-not "la.local" } } */
Sorry, I've been busy with something else these two days. I don't think there's anything wrong with the code, but I need to test the spec.:-) 在 2023/12/21 下午7:56, Xi Ruoyao 写道: > Ping :). > > On Tue, 2023-12-12 at 14:47 +0800, Xi Ruoyao wrote: >> The problem with peephole2 is it uses a naive sliding-window algorithm >> and misses many cases. For example: >> >> float a[10000]; >> float t() { return a[0] + a[8000]; } >> >> is compiled to: >> >> la.local $r13,a >> la.local $r12,a+32768 >> fld.s $f1,$r13,0 >> fld.s $f0,$r12,-768 >> fadd.s $f0,$f1,$f0 >> >> by trunk. But as we've explained in r14-4851, the following would be >> better with -mexplicit-relocs=auto: >> >> pcalau12i $r13,%pc_hi20(a) >> pcalau12i $r12,%pc_hi20(a+32000) >> fld.s $f1,$r13,%pc_lo12(a) >> fld.s $f0,$r12,%pc_lo12(a+32000) >> fadd.s $f0,$f1,$f0 >> >> However the sliding-window algorithm just won't detect the pcalau12i/fld >> pair to be optimized. Use a define_insn_and_split in combine pass will >> work around the issue. >> >> gcc/ChangeLog: >> >> * config/loongarch/loongarch.md: >> (simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>): New >> define_insn_and_split. >> (simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>): Likewise. >> (simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>): Likewise. >> (simple_load_off<su>ext<P:mode><SUBDI:mode><GPR:mode>): >> Likewise. >> (simple_store<ST_ANY:mode><P:mode>): Likewise. >> (simple_store_off<ST_ANY:mode><P:mode>): Likewise. >> (define_peephole2): Remove la.local/[f]ld peepholes. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c: >> New test. >> --- >> >> Bootstrapped & regtested on loongarch64-linux-gnu. Ok for trunk? 
>> >> gcc/config/loongarch/loongarch.md | 165 +++++++++--------- >> ...explicit-relocs-auto-single-load-store-2.c | 11 ++ >> 2 files changed, 98 insertions(+), 78 deletions(-) >> create mode 100644 gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c >> >> diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md >> index 7b26d15aa4e..4009de408fb 100644 >> --- a/gcc/config/loongarch/loongarch.md >> +++ b/gcc/config/loongarch/loongarch.md >> @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w" >> ;; >> ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with >> ;; 3 instructions). >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 "symbolic_pcrel_operand")) >> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") >> - (mem:LD_AT_LEAST_32_BIT (match_dup 0)))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0]) \ >> - || REGNO (operands[0]) == REGNO (operands[2]))" >> - [(set (match_dup 2) >> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] >> +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>" >> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") >> + (mem:LD_AT_LEAST_32_BIT >> + (match_operand:P 1 "symbolic_pcrel_operand" "")))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (match_dup 0) >> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] >> { >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + operands[2] = gen_reg_rtx (Pmode); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); >> }) >> >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 
"symbolic_pcrel_operand")) >> - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") >> - (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0) >> - (match_operand 3 "const_int_operand"))))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0]) \ >> - || REGNO (operands[0]) == REGNO (operands[2]))" >> - [(set (match_dup 2) >> - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] >> +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>" >> + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") >> + (mem:LD_AT_LEAST_32_BIT >> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") >> + (match_operand 2 "const_int_operand" ""))))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (match_dup 0) >> + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] >> { >> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + HOST_WIDE_INT offset = INTVAL (operands[2]); >> + operands[2] = gen_reg_rtx (Pmode); >> + operands[1] = plus_constant (Pmode, operands[1], offset); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); >> }) >> >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 "symbolic_pcrel_operand")) >> - (set (match_operand:GPR 2 "register_operand") >> - (any_extend:GPR (mem:SUBDI (match_dup 0))))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0]) \ >> - || REGNO (operands[0]) == REGNO (operands[2]))" >> - [(set (match_dup 2) >> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) >> - (match_dup 1)))))] >> +(define_insn_and_split 
"simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>" >> + [(set (match_operand:GPR 0 "register_operand" "=r") >> + (any_extend:GPR >> + (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (match_dup 0) >> + (any_extend:GPR >> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] >> { >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + operands[2] = gen_reg_rtx (Pmode); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); >> }) >> >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 "symbolic_pcrel_operand")) >> - (set (match_operand:GPR 2 "register_operand") >> +(define_insn_and_split >> + "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>" >> + [(set (match_operand:GPR 0 "register_operand" "=r") >> + (any_extend:GPR >> + (mem:SUBDI >> + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") >> + (match_operand 2 "const_int_operand" "")))))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (match_dup 0) >> (any_extend:GPR >> - (mem:SUBDI (plus (match_dup 0) >> - (match_operand 3 "const_int_operand")))))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0]) \ >> - || REGNO (operands[0]) == REGNO (operands[2]))" >> - [(set (match_dup 2) >> - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) >> - (match_dup 1)))))] >> + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] >> { >> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + HOST_WIDE_INT offset = INTVAL 
(operands[2]); >> + operands[2] = gen_reg_rtx (Pmode); >> + operands[1] = plus_constant (Pmode, operands[1], offset); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); >> }) >> >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 "symbolic_pcrel_operand")) >> - (set (mem:ST_ANY (match_dup 0)) >> - (match_operand:ST_ANY 2 "register_operand"))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0])) \ >> - && REGNO (operands[0]) != REGNO (operands[2])" >> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] >> +(define_insn_and_split "simple_store<ST_ANY:mode><P:mode>" >> + [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand")) >> + (match_operand:ST_ANY 1 "register_operand" "r,f"))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))] >> { >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + operands[2] = gen_reg_rtx (Pmode); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0])); >> }) >> >> -(define_peephole2 >> - [(set (match_operand:P 0 "register_operand") >> - (match_operand:P 1 "symbolic_pcrel_operand")) >> - (set (mem:ST_ANY (plus (match_dup 0) >> - (match_operand 3 "const_int_operand"))) >> - (match_operand:ST_ANY 2 "register_operand"))] >> - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ >> - && (peep2_reg_dead_p (2, operands[0])) \ >> - && REGNO (operands[0]) != REGNO (operands[2])" >> - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] >> +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>" >> + [(set (mem:ST_ANY >> + (plus (match_operand:P 0 
"symbolic_pcrel_operand" "") >> + (match_operand 1 "const_int_operand" ""))) >> + (match_operand:ST_ANY 2 "register_operand" "r,f"))] >> + "loongarch_pre_reload_split () \ >> + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ >> + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" >> + "#" >> + "" >> + [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))] >> { >> - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); >> - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); >> + HOST_WIDE_INT offset = INTVAL (operands[1]); >> + operands[1] = gen_reg_rtx (Pmode); >> + operands[0] = plus_constant (Pmode, operands[0], offset); >> + emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0])); >> }) >> >> ;; Synchronization instructions. >> diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c >> new file mode 100644 >> index 00000000000..42cb966d1e0 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c >> @@ -0,0 +1,11 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */ >> + >> +float a[8001]; >> +float >> +t (void) >> +{ >> + return a[0] + a[8000]; >> +} >> + >> +/* { dg-final { scan-assembler-not "la.local" } } */
在 2023/12/21 下午8:00, chenglulu 写道: > Sorry, I've been busy with something else these two days. I don't > think there's anything wrong with the code, > > but I need to test the spec.:-) Hi, Ruoyao: After applying this patch, spec2006 464.h264 ref will have a 6.4% performance drop. So I'm going to retest it. > > 在 2023/12/21 下午7:56, Xi Ruoyao 写道: >> Ping :). >>
On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote: > > 在 2023/12/21 下午8:00, chenglulu 写道: > > Sorry, I've been busy with something else these two days. I don't > > think there's anything wrong with the code, > > > > but I need to test the spec.:-) > > Hi, Ruoyao: > > After applying this patch, spec2006 464.h264 ref will have a 6.4% > performance drop. So I'm going to retest it. I think 6.4% is large enough not to be a random error. Is there an example showing the code regression? And I'm wondering if keeping the peephole besides the new define_insn_and_split produces a better result instead of solely relying on define_insn_and_split?
在 2023/12/22 下午3:09, Xi Ruoyao 写道: > On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote: >> 在 2023/12/21 下午8:00, chenglulu 写道: >>> Sorry, I've been busy with something else these two days. I don't >>> think there's anything wrong with the code, >>> >>> but I need to test the spec.:-) >> Hi, Ruoyao: >> >> After applying this patch, spec2006 464.h264 ref will have a 6.4% >> performance drop. So I'm going to retest it. > I think 6.4% is large enough not to be a random error. > > Is there an example showing the code regression? > > And I'm wondering if keeping the peephole besides the new > define_insn_and_split produces a better result instead of solely relying > on define_insn_and_split? > I haven't debugged this yet, I'm retesting, if there is still such a big performance gap, I think I need to see the reason.
在 2023/12/22 下午3:21, chenglulu 写道: > > 在 2023/12/22 下午3:09, Xi Ruoyao 写道: >> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote: >>> 在 2023/12/21 下午8:00, chenglulu 写道: >>>> Sorry, I've been busy with something else these two days. I don't >>>> think there's anything wrong with the code, >>>> >>>> but I need to test the spec.:-) >>> Hi, Ruoyao: >>> >>> After applying this patch, spec2006 464.h264 ref will have a 6.4% >>> performance drop. So I'm going to retest it. >> I think 6.4% is large enough not to be a random error. >> >> Is there an example showing the code regression? >> >> And I'm wondering if keeping the peephole besides the new >> define_insn_and_split produces a better result instead of solely relying >> on define_insn_and_split? >> > I haven't debugged this yet, I'm retesting, if there is still such a > big performance gap, > > I think I need to see the reason. > The performance drop has nothing to do with this patch. I found that the h264 performance compiled by r14-6787 compared to r14-6421 dropped by 6.4%.
在 2023/12/23 上午10:26, chenglulu 写道: > > 在 2023/12/22 下午3:21, chenglulu 写道: >> >> 在 2023/12/22 下午3:09, Xi Ruoyao 写道: >>> On Fri, 2023-12-22 at 11:44 +0800, chenglulu wrote: >>>> 在 2023/12/21 下午8:00, chenglulu 写道: >>>>> Sorry, I've been busy with something else these two days. I don't >>>>> think there's anything wrong with the code, >>>>> >>>>> but I need to test the spec.:-) >>>> Hi, Ruoyao: >>>> >>>> After applying this patch, spec2006 464.h264 ref will have a 6.4% >>>> performance drop. So I'm going to retest it. >>> I think 6.4% is large enough not to be a random error. >>> >>> Is there an example showing the code regression? >>> >>> And I'm wondering if keeping the peephole besides the new >>> define_insn_and_split produces a better result instead of solely >>> relying >>> on define_insn_and_split? >>> >> I haven't debugged this yet, I'm retesting, if there is still such a >> big performance gap, >> >> I think I need to see the reason. >> > The performance drop has nothing to do with this patch. I found that > the h264 performance compiled > > by r14-6787 compared to r14-6421 dropped by 6.4%. > > But there is a problem. My regression test has the following two fail items.(based on r14-6787) +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6
On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled > > by r14-6787 compared to r14-6421 dropped by 6.4%. Then I guess we should create a bug report... > But there is a problem. My regression test has the following two fail items.(based on r14-6787) > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 Strange. I didn't see them on r14-6650 (with or without the patch).
On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote: > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: > > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled > > > by r14-6787 compared to r14-6421 dropped by 6.4%. > > Then I guess we should create a bug report... > > > But there is a problem. My regression test has the following two fail items.(based on r14-6787) > > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) I guess this is https://gcc.gnu.org/PR28123. > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 I'll take a look at this. Maybe it will show up with Binutils trunk (I just realized I tested this patch with Binutils 2.41, and it's not sufficient to really test the change). > Strange. I didn't see them on r14-6650 (with or without the patch).
On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote: > On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote: > > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: > > > > The performance drop has nothing to do with this patch. I found that the h264 performance compiled > > > > by r14-6787 compared to r14-6421 dropped by 6.4%. > > > > Then I guess we should create a bug report... > > > > > But there is a problem. My regression test has the following two fail items.(based on r14-6787) > > > > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) > > I guess this is https://gcc.gnu.org/PR28123. > > > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 > > I'll take a look on this. Maybe it will show up with Binutils trunk (I > just realized I tested this patch with Binutils 2.41, and it's not > sufficient to really test the change). I cannot reproduce the issue on a Gentoo dev machine with Binutils 2.41.50.20231218 and the patch on top of r14-6819. And in my manual testing (for ruling out the difference caused by default PIE and SSP) the test also passes: xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc-build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git-repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector -fno-pie && grep -c mem/v pr86617.c.348r.final 6 Could you recheck with latest GCC master?
在 2023/12/24 下午8:59, Xi Ruoyao 写道: > On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote: >> On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote: >>> On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: >>>>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled >>>>> by r14-6787 compared to r14-6421 dropped by 6.4%. >>> Then I guess we should create a bug report... >>> >>>> But there is a problem. My regression test has the following two fail items.(based on r14-6787) >>>> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) >> I guess this is https://gcc.gnu.org/PR28123. >> >>>> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 >> I'll take a look on this. Maybe it will show up with Binutils trunk (I >> just realized I tested this patch with Binutils 2.41, and it's not >> sufficient to really test the change). > I cannot reproduce the issue on a Gentoo dev machine with Binutils > 2.41.50.20231218 and the patch on top of r14-6819. And in my manual > testing (for ruling out the difference caused by default PIE and SSP) > the test also passes: > > xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc- > build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ /home/xry111/git- > repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output -Os > -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack-protector > -fno-pie && grep -c mem/v pr86617.c.348r.final > 6 > > Could you recheck with latest GCC master? Ok, I'll test again with the latest code. >
On Mon, 2023-12-25 at 10:08 +0800, chenglulu wrote: > > 在 2023/12/24 下午8:59, Xi Ruoyao 写道: > > On Sat, 2023-12-23 at 18:47 +0800, Xi Ruoyao wrote: > > > On Sat, 2023-12-23 at 18:44 +0800, Xi Ruoyao wrote: > > > > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: > > > > > > The performance drop has nothing to do with this patch. I > > > > > > found that the h264 performance compiled > > > > > > by r14-6787 compared to r14-6421 dropped by 6.4%. > > > > Then I guess we should create a bug report... > > > > > > > > > But there is a problem. My regression test has the following > > > > > two fail items.(based on r14-6787) > > > > > +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) > > > I guess this is https://gcc.gnu.org/PR28123. > > > > > > > > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 > > > I'll take a look on this. Maybe it will show up with Binutils > > > trunk (I > > > just realized I tested this patch with Binutils 2.41, and it's not > > > sufficient to really test the change). > > I cannot reproduce the issue on a Gentoo dev machine with Binutils > > 2.41.50.20231218 and the patch on top of r14-6819. And in my manual > > testing (for ruling out the difference caused by default PIE and > > SSP) > > the test also passes: > > > > xry111@nanmen2 ~/git-repos/gcc-build $ /home/xry111/git-repos/gcc- > > build/gcc/xgcc -B/home/xry111/git-repos/gcc-build/gcc/ > > /home/xry111/git- > > repos/gcc/gcc/testsuite/gcc.dg/pr86617.c -fdiagnostics-plain-output > > -Os > > -fdump-rtl-final -ffat-lto-objects -S -o pr86617.s -fno-stack- > > protector > > -fno-pie && grep -c mem/v pr86617.c.348r.final > > 6 > > > > Could you recheck with latest GCC master? > Ok, I'll test again with the latest code. Per https://gcc.gnu.org/pipermail/gcc-patches/2023-December/641407.html I need to add "&& true" to the split condition. I'll test it and send V2.
在 2023/12/23 下午6:44, Xi Ruoyao 写道: > On Sat, 2023-12-23 at 10:29 +0800, chenglulu wrote: >>> The performance drop has nothing to do with this patch. I found that the h264 performance compiled >>> by r14-6787 compared to r14-6421 dropped by 6.4%. > Then I guess we should create a bug report... The code h264 score in r14-6818 is the same as that of r14-6421. > >> But there is a problem. My regression test has the following two fail items.(based on r14-6787) >> +FAIL: gcc.dg/cpp/_Pragma3.c (test for excess errors) >> +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 > Strange. I didn't see them on r14-6650 (with or without the patch). > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 In r14-6818 the issue persists. I kind of chased the code and found that the problem is like this: volatile unsigned char u8; void test (void) { u8 = u8 + u8; u8 = u8 - u8; } $./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output -Os -fdump-rtl-final -ffat-lto-objects test.c.301r.outof_cfglayout (insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ]) (zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi} (nil)) test.c.302r.split1 (insn 27 6 28 2 (set (reg:DI 98) (unspec:DI [ (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) ] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1 (nil)) (insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ]) (zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98) (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0 S1 A8]))) "volatile.c":5:11 -1 (nil)) The volatile property of the mem here is gone, so the test fails.
On Wed, 2023-12-27 at 11:59 +0800, chenglulu wrote: > +FAIL: gcc.dg/pr86617.c scan-rtl-dump-times final "mem/v" 6 > > In r14-6818 the issue persists. I kind of chased the code and found that the problem is like this: > volatile unsigned char u8; > > void test (void) > { > u8 = u8 + u8; > u8 = u8 - u8; > } > > $./gcc/cc1 test.c -o test.s -fdump-rtl-all-all -fdiagnostics-plain-output -Os -fdump-rtl-final -ffat-lto-objects > > test.c.301r.outof_cfglayout > > (insn 7 6 9 2 (set (reg:DI 80 [ u8.0_1 ]) > (zero_extend:DI (mem/v/c:QI (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) [0 u8D.2193+0 S1 A8]))) "volatile.c":5:11 459 {simple_load_uextdiqidi} > (nil)) > > test.c.302r.split1 > > (insn 27 6 28 2 (set (reg:DI 98) > (unspec:DI [ > (symbol_ref:DI ("*.LANCHOR0") [flags 0x182]) > ] UNSPEC_PCALAU12I_GR)) "volatile.c":5:11 -1 > (nil)) > (insn 28 27 9 2 (set (reg:DI 80 [ u8.0_1 ]) > (zero_extend:DI (mem:QI (lo_sum:DI (reg:DI 98) > (symbol_ref:DI ("*.LANCHOR0") [flags 0x182])) [0 S1 A8]))) "volatile.c":5:11 -1 > (nil)) > > The volatile property of the mem here is gone, so the test fails. Phew. I guess I couldn't reproduce it because I have Jeff's ext-dce patch in my local repo, which removed the zero_extend... I'll rework this patch.
diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index 7b26d15aa4e..4009de408fb 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -4033,101 +4033,110 @@ (define_insn "loongarch_crcc_w_<size>_w" ;; ;; And if the pseudo op cannot be relaxed, we'll get a worse result (with ;; 3 instructions). -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") - (mem:LD_AT_LEAST_32_BIT (match_dup 0)))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0]) \ - || REGNO (operands[0]) == REGNO (operands[2]))" - [(set (match_dup 2) - (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] +(define_insn_and_split "simple_load<P:mode><LD_AT_LEAST_32_BIT:mode>" + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") + (mem:LD_AT_LEAST_32_BIT + (match_operand:P 1 "symbolic_pcrel_operand" "")))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (match_dup 0) + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] { - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + operands[2] = gen_reg_rtx (Pmode); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); }) -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (match_operand:LD_AT_LEAST_32_BIT 2 "register_operand") - (mem:LD_AT_LEAST_32_BIT (plus (match_dup 0) - (match_operand 3 "const_int_operand"))))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0]) \ - || REGNO (operands[0]) == REGNO (operands[2]))" - [(set (match_dup 2) 
- (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 0) (match_dup 1))))] +(define_insn_and_split "simple_load_off<P:mode><LD_AT_LEAST_32_BIT:mode>" + [(set (match_operand:LD_AT_LEAST_32_BIT 0 "register_operand" "=r,f") + (mem:LD_AT_LEAST_32_BIT + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") + (match_operand 2 "const_int_operand" ""))))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (match_dup 0) + (mem:LD_AT_LEAST_32_BIT (lo_sum:P (match_dup 2) (match_dup 1))))] { - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + HOST_WIDE_INT offset = INTVAL (operands[2]); + operands[2] = gen_reg_rtx (Pmode); + operands[1] = plus_constant (Pmode, operands[1], offset); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); }) -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (match_operand:GPR 2 "register_operand") - (any_extend:GPR (mem:SUBDI (match_dup 0))))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0]) \ - || REGNO (operands[0]) == REGNO (operands[2]))" - [(set (match_dup 2) - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) - (match_dup 1)))))] +(define_insn_and_split "simple_load_<su>ext<P:mode><SUBDI:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_extend:GPR + (mem:SUBDI (match_operand:P 1 "symbolic_pcrel_operand" ""))))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (match_dup 0) + (any_extend:GPR + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] { - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + operands[2] = 
gen_reg_rtx (Pmode); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); }) -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (match_operand:GPR 2 "register_operand") +(define_insn_and_split + "simple_load_off_<su>ext<P:mode><SUBDI:mode><GPR:mode>" + [(set (match_operand:GPR 0 "register_operand" "=r") + (any_extend:GPR + (mem:SUBDI + (plus (match_operand:P 1 "symbolic_pcrel_operand" "") + (match_operand 2 "const_int_operand" "")))))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (match_dup 0) (any_extend:GPR - (mem:SUBDI (plus (match_dup 0) - (match_operand 3 "const_int_operand")))))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0]) \ - || REGNO (operands[0]) == REGNO (operands[2]))" - [(set (match_dup 2) - (any_extend:GPR (mem:SUBDI (lo_sum:P (match_dup 0) - (match_dup 1)))))] + (mem:SUBDI (lo_sum:P (match_dup 2) (match_dup 1)))))] { - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + HOST_WIDE_INT offset = INTVAL (operands[2]); + operands[2] = gen_reg_rtx (Pmode); + operands[1] = plus_constant (Pmode, operands[1], offset); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[1])); }) -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (mem:ST_ANY (match_dup 0)) - (match_operand:ST_ANY 2 "register_operand"))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0])) \ - && REGNO (operands[0]) != REGNO (operands[2])" - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] +(define_insn_and_split 
"simple_store<ST_ANY:mode><P:mode>" + [(set (mem:ST_ANY (match_operand:P 0 "symbolic_pcrel_operand")) + (match_operand:ST_ANY 1 "register_operand" "r,f"))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (mem:ST_ANY (lo_sum:P (match_dup 2) (match_dup 0))) (match_dup 1))] { - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + operands[2] = gen_reg_rtx (Pmode); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[2], operands[0])); }) -(define_peephole2 - [(set (match_operand:P 0 "register_operand") - (match_operand:P 1 "symbolic_pcrel_operand")) - (set (mem:ST_ANY (plus (match_dup 0) - (match_operand 3 "const_int_operand"))) - (match_operand:ST_ANY 2 "register_operand"))] - "la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ - && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM) \ - && (peep2_reg_dead_p (2, operands[0])) \ - && REGNO (operands[0]) != REGNO (operands[2])" - [(set (mem:ST_ANY (lo_sum:P (match_dup 0) (match_dup 1))) (match_dup 2))] +(define_insn_and_split "simple_store_off<ST_ANY:mode><P:mode>" + [(set (mem:ST_ANY + (plus (match_operand:P 0 "symbolic_pcrel_operand" "") + (match_operand 1 "const_int_operand" ""))) + (match_operand:ST_ANY 2 "register_operand" "r,f"))] + "loongarch_pre_reload_split () \ + && la_opt_explicit_relocs == EXPLICIT_RELOCS_AUTO \ + && (TARGET_CMODEL_NORMAL || TARGET_CMODEL_MEDIUM)" + "#" + "" + [(set (mem:ST_ANY (lo_sum:P (match_dup 1) (match_dup 0))) (match_dup 2))] { - operands[1] = plus_constant (Pmode, operands[1], INTVAL (operands[3])); - emit_insn (gen_pcalau12i_gr<P:mode> (operands[0], operands[1])); + HOST_WIDE_INT offset = INTVAL (operands[1]); + operands[1] = gen_reg_rtx (Pmode); + operands[0] = plus_constant (Pmode, operands[0], offset); + emit_insn (gen_pcalau12i_gr<P:mode> (operands[1], operands[0])); }) ;; Synchronization instructions. 
diff --git a/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c new file mode 100644 index 00000000000..42cb966d1e0 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/explicit-relocs-auto-single-load-store-2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=loongarch64 -mabi=lp64d -mexplicit-relocs=auto" } */ + +float a[8001]; +float +t (void) +{ + return a[0] + a[8000]; +} + +/* { dg-final { scan-assembler-not "la.local" } } */