Message ID | 20230412121648.1394569-1-xry111@xry111.site |
---|---|
State | New |
Headers | show |
Series | [GCC14] LoongArch: Improve cpymemsi expansion [PR109465] | expand |
在 2023/4/12 下午8:16, Xi Ruoyao 写道: > We'd been generating really bad block move sequences, which kernel > developers who tried __builtin_memcpy recently complained about. To improve > it: > > 1. Take advantage of -mno-strict-align. When it is set, set mode > size to UNITS_PER_WORD regardless of the alignment. > 2. Halve the mode size when (block size) % (mode size) != 0, instead of > falling back to ld.bu/st.b at once. > 3. Limit the length of block move sequence considering the number of > instructions, not the size of block. When -mstrict-align is set and > the block is not aligned, the old size limit for straight-line > implementation (64 bytes) was definitely too large (we don't have 64 > registers anyway). > > Bootstrapped and regtested on loongarch64-linux-gnu. Ok for GCC 14? /* snip */ > > static void > -loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) > +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, > + HOST_WIDE_INT delta) > { > - HOST_WIDE_INT offset, delta; > - unsigned HOST_WIDE_INT bits; > + HOST_WIDE_INT offs, delta_cur; > int i; > machine_mode mode; > rtx *regs; > > - bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))); > - > - mode = int_mode_for_size (bits, 0).require (); > - delta = bits / BITS_PER_UNIT; > + HOST_WIDE_INT num_reg = length / delta; I think a comment needs to be added here; without tracing through the code, it is not easy to understand. Otherwise LGTM! Thanks! > + for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2) > + num_reg += !!(length & delta_cur); > > /* Allocate a buffer for the temporary registers. */ > - regs = XALLOCAVEC (rtx, length / delta); > + regs = XALLOCAVEC (rtx, num_reg); > > - /* Load as many BITS-sized chunks as possible. Use a normal load if > - the source has enough alignment, otherwise use left/right pairs. 
*/ > - for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) > { > - regs[i] = gen_reg_rtx (mode); > - loongarch_emit_move (regs[i], adjust_address (src, mode, offset)); > - } > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); > > - for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) > - loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]); > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > + { > + regs[i] = gen_reg_rtx (mode); > + loongarch_emit_move (regs[i], adjust_address (src, mode, offs)); > + } > + } > > - /* Mop up any left-over bytes. */ > - if (offset < length) > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) > { > - src = adjust_address (src, BLKmode, offset); > - dest = adjust_address (dest, BLKmode, offset); > - move_by_pieces (dest, src, length - offset, > - MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), > - (enum memop_ret) 0); > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); > + > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > + loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]); > } > } > > @@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg, > > static void > loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, > - HOST_WIDE_INT bytes_per_iter) > + HOST_WIDE_INT align) > { > rtx_code_label *label; > rtx src_reg, dest_reg, final_src, test; > + HOST_WIDE_INT bytes_per_iter = align * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER; > HOST_WIDE_INT leftover; > > leftover = length % bytes_per_iter; > @@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, > emit_label (label); > > /* Emit the loop body. 
*/ > - loongarch_block_move_straight (dest, src, bytes_per_iter); > + loongarch_block_move_straight (dest, src, bytes_per_iter, align); > > /* Move on to the next block. */ > loongarch_emit_move (src_reg, > @@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, > > /* Mop up any left-over bytes. */ > if (leftover) > - loongarch_block_move_straight (dest, src, leftover); > + loongarch_block_move_straight (dest, src, leftover, align); > else > /* Temporary fix for PR79150. */ > emit_insn (gen_nop ()); > @@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, > memory reference SRC to memory reference DEST. */ > > bool > -loongarch_expand_block_move (rtx dest, rtx src, rtx length) > +loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align) > { > - int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT; > + if (!CONST_INT_P (r_length)) > + return false; > + > + HOST_WIDE_INT length = INTVAL (r_length); > + if (length > loongarch_max_inline_memcpy_size) > + return false; > + > + HOST_WIDE_INT align = INTVAL (r_align); > + > + if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) > + align = UNITS_PER_WORD; > > - if (CONST_INT_P (length) > - && INTVAL (length) <= loongarch_max_inline_memcpy_size) > + if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) > { > - if (INTVAL (length) <= max_move_bytes) > - { > - loongarch_block_move_straight (dest, src, INTVAL (length)); > - return true; > - } > - else if (optimize) > - { > - loongarch_block_move_loop (dest, src, INTVAL (length), > - LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER); > - return true; > - } > + loongarch_block_move_straight (dest, src, length, align); > + return true; > + } > + > + if (optimize) > + { > + loongarch_block_move_loop (dest, src, length, align); > + return true; > } > + > return false; > } > > diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h > index 7151d5cabb3..1bcd144a5d9 100644 > --- 
a/gcc/config/loongarch/loongarch.h > +++ b/gcc/config/loongarch/loongarch.h > @@ -1063,13 +1063,13 @@ typedef struct { > > /* The maximum number of bytes that can be copied by one iteration of > a cpymemsi loop; see loongarch_block_move_loop. */ > -#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) > +#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4 > > /* The maximum number of bytes that can be copied by a straight-line > implementation of cpymemsi; see loongarch_block_move_straight. We want > to make sure that any loop-based implementation will iterate at > least twice. */ > -#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2) > +#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) > > /* The base cost of a memcpy call, for MOVE_RATIO and friends. These > values were determined experimentally by benchmarking with CSiBE. > @@ -1077,7 +1077,7 @@ typedef struct { > #define LARCH_CALL_RATIO 8 > > /* Any loop-based implementation of cpymemsi will have at least > - LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory > + LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory > moves, so allow individual copies of fewer elements. > > When cpymemsi is not available, use a value approximating > @@ -1088,9 +1088,7 @@ typedef struct { > value of LARCH_CALL_RATIO to take that into account. */ > > #define MOVE_RATIO(speed) \ > - (HAVE_cpymemsi \ > - ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \ > - : CLEAR_RATIO (speed) / 2) > + (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2) > > /* For CLEAR_RATIO, when optimizing for size, give a better estimate > of the length of a memset call, but use the default otherwise. 
*/ > diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md > index 628ecc78088..816a943d155 100644 > --- a/gcc/config/loongarch/loongarch.md > +++ b/gcc/config/loongarch/loongarch.md > @@ -2488,7 +2488,8 @@ (define_expand "cpymemsi" > "" > { > if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P > - && loongarch_expand_block_move (operands[0], operands[1], operands[2])) > + && loongarch_expand_block_move (operands[0], operands[1], > + operands[2], operands[3])) > DONE; > else > FAIL; > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > new file mode 100644 > index 00000000000..4cd35d13904 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */ > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > + > +extern char a[], b[]; > +void test() { __builtin_memcpy(a, b, 15); } > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > new file mode 100644 > index 00000000000..703eb951c6d > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > + > +extern long a[], b[]; > +void test() { __builtin_memcpy(a, b, 15); } > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > new file mode 100644 > index 
00000000000..d6a80659b31 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > + > +/* Three loop iterations each contains 4 st.b, and 3 st.b after the loop */ > +/* { dg-final { scan-assembler-times "st\\.b" 7 } } */ > + > +/* { dg-final { scan-assembler-not "st\\.h" } } */ > +/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */ > +/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */ > + > +extern char a[], b[]; > +void test() { __builtin_memcpy(a, b, 15); }
On Wed, 2023-04-19 at 11:03 +0800, Lulu Cheng wrote: /* snip */ > > +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT > > length, > > + HOST_WIDE_INT delta) > > { > > - HOST_WIDE_INT offset, delta; > > - unsigned HOST_WIDE_INT bits; > > + HOST_WIDE_INT offs, delta_cur; > > int i; > > machine_mode mode; > > rtx *regs; > > > > - bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN > > (dest))); > > - > > - mode = int_mode_for_size (bits, 0).require (); > > - delta = bits / BITS_PER_UNIT; > > + HOST_WIDE_INT num_reg = length / delta; > > I think a comment needs to be added here; without tracing through the code, > it is not easy to understand. Pushed r14-70 with the following addition: + /* Calculate how many registers we'll need for the block move. + We'll emit length / delta move operations with delta as the size + first. Then we may still have length % delta bytes not copied. + We handle these remaining bytes by move operations with smaller + (halved) sizes. For example, if length = 21 and delta = 8, we'll + emit two ld.d/st.d pairs, one ld.w/st.w pair, and one ld.b/st.b + pair. For each load/store pair we use a dedicated register to keep + the pipeline as populated as possible. */ > Otherwise LGTM! > > Thanks! > > > + for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2) > > + num_reg += !!(length & delta_cur); > > > > /* Allocate a buffer for the temporary registers. */ > > - regs = XALLOCAVEC (rtx, length / delta); > > + regs = XALLOCAVEC (rtx, num_reg); > > > > - /* Load as many BITS-sized chunks as possible. Use a normal load > > if > > - the source has enough alignment, otherwise use left/right > > pairs. 
*/ > > - for (offset = 0, i = 0; offset + delta <= length; offset += > > delta, i++) > > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur > > /= 2) > > { > > - regs[i] = gen_reg_rtx (mode); > > - loongarch_emit_move (regs[i], adjust_address (src, mode, > > offset)); > > - } > > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, > > 0).require (); > > > > - for (offset = 0, i = 0; offset + delta <= length; offset += > > delta, i++) > > - loongarch_emit_move (adjust_address (dest, mode, offset), > > regs[i]); > > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > > + { > > + regs[i] = gen_reg_rtx (mode); > > + loongarch_emit_move (regs[i], adjust_address (src, mode, > > offs)); > > + } > > + } > > > > - /* Mop up any left-over bytes. */ > > - if (offset < length) > > + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur > > /= 2) > > { > > - src = adjust_address (src, BLKmode, offset); > > - dest = adjust_address (dest, BLKmode, offset); > > - move_by_pieces (dest, src, length - offset, > > - MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), > > - (enum memop_ret) 0); > > + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, > > 0).require (); > > + > > + for (; offs + delta_cur <= length; offs += delta_cur, i++) > > + loongarch_emit_move (adjust_address (dest, mode, offs), > > regs[i]); > > } > > } > > > > @@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem, > > HOST_WIDE_INT length, rtx *loop_reg, > > > > static void > > loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT > > length, > > - HOST_WIDE_INT bytes_per_iter) > > + HOST_WIDE_INT align) > > { > > rtx_code_label *label; > > rtx src_reg, dest_reg, final_src, test; > > + HOST_WIDE_INT bytes_per_iter = align * > > LARCH_MAX_MOVE_OPS_PER_LOOP_ITER; > > HOST_WIDE_INT leftover; > > > > leftover = length % bytes_per_iter; > > @@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src, > > HOST_WIDE_INT length, > > emit_label (label); > > > > /* Emit 
the loop body. */ > > - loongarch_block_move_straight (dest, src, bytes_per_iter); > > + loongarch_block_move_straight (dest, src, bytes_per_iter, align); > > > > /* Move on to the next block. */ > > loongarch_emit_move (src_reg, > > @@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src, > > HOST_WIDE_INT length, > > > > /* Mop up any left-over bytes. */ > > if (leftover) > > - loongarch_block_move_straight (dest, src, leftover); > > + loongarch_block_move_straight (dest, src, leftover, align); > > else > > /* Temporary fix for PR79150. */ > > emit_insn (gen_nop ()); > > @@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx > > src, HOST_WIDE_INT length, > > memory reference SRC to memory reference DEST. */ > > > > bool > > -loongarch_expand_block_move (rtx dest, rtx src, rtx length) > > +loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx > > r_align) > > { > > - int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT; > > + if (!CONST_INT_P (r_length)) > > + return false; > > + > > + HOST_WIDE_INT length = INTVAL (r_length); > > + if (length > loongarch_max_inline_memcpy_size) > > + return false; > > + > > + HOST_WIDE_INT align = INTVAL (r_align); > > + > > + if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) > > + align = UNITS_PER_WORD; > > > > - if (CONST_INT_P (length) > > - && INTVAL (length) <= loongarch_max_inline_memcpy_size) > > + if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) > > { > > - if (INTVAL (length) <= max_move_bytes) > > - { > > - loongarch_block_move_straight (dest, src, INTVAL > > (length)); > > - return true; > > - } > > - else if (optimize) > > - { > > - loongarch_block_move_loop (dest, src, INTVAL (length), > > - > > LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER); > > - return true; > > - } > > + loongarch_block_move_straight (dest, src, length, align); > > + return true; > > + } > > + > > + if (optimize) > > + { > > + loongarch_block_move_loop (dest, src, length, align); > > + return true; > > } > > + > > return 
false; > > } > > > > diff --git a/gcc/config/loongarch/loongarch.h > > b/gcc/config/loongarch/loongarch.h > > index 7151d5cabb3..1bcd144a5d9 100644 > > --- a/gcc/config/loongarch/loongarch.h > > +++ b/gcc/config/loongarch/loongarch.h > > @@ -1063,13 +1063,13 @@ typedef struct { > > > > /* The maximum number of bytes that can be copied by one iteration > > of > > a cpymemsi loop; see loongarch_block_move_loop. */ > > -#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) > > +#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4 > > > > /* The maximum number of bytes that can be copied by a straight- > > line > > implementation of cpymemsi; see loongarch_block_move_straight. > > We want > > to make sure that any loop-based implementation will iterate at > > least twice. */ > > -#define LARCH_MAX_MOVE_BYTES_STRAIGHT > > (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2) > > +#define LARCH_MAX_MOVE_OPS_STRAIGHT > > (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) > > > > /* The base cost of a memcpy call, for MOVE_RATIO and friends. > > These > > values were determined experimentally by benchmarking with > > CSiBE. > > @@ -1077,7 +1077,7 @@ typedef struct { > > #define LARCH_CALL_RATIO 8 > > > > /* Any loop-based implementation of cpymemsi will have at least > > - LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory > > + LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory > > moves, so allow individual copies of fewer elements. > > > > When cpymemsi is not available, use a value approximating > > @@ -1088,9 +1088,7 @@ typedef struct { > > value of LARCH_CALL_RATIO to take that into account. */ > > > > #define MOVE_RATIO(speed) \ > > - (HAVE_cpymemsi \ > > - ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \ > > - : CLEAR_RATIO (speed) / 2) > > + (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO > > (speed) / 2) > > > > /* For CLEAR_RATIO, when optimizing for size, give a better > > estimate > > of the length of a memset call, but use the default otherwise. 
> > */ > > diff --git a/gcc/config/loongarch/loongarch.md > > b/gcc/config/loongarch/loongarch.md > > index 628ecc78088..816a943d155 100644 > > --- a/gcc/config/loongarch/loongarch.md > > +++ b/gcc/config/loongarch/loongarch.md > > @@ -2488,7 +2488,8 @@ (define_expand "cpymemsi" > > "" > > { > > if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P > > - && loongarch_expand_block_move (operands[0], operands[1], > > operands[2])) > > + && loongarch_expand_block_move (operands[0], operands[1], > > + operands[2], operands[3])) > > DONE; > > else > > FAIL; > > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > new file mode 100644 > > index 00000000000..4cd35d13904 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c > > @@ -0,0 +1,9 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */ > > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > > + > > +extern char a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } > > diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > new file mode 100644 > > index 00000000000..703eb951c6d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c > > @@ -0,0 +1,9 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > > +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ > > +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ > > + > > +extern long a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } > > diff --git 
a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > new file mode 100644 > > index 00000000000..d6a80659b31 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c > > @@ -0,0 +1,12 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ > > + > > +/* Three loop iterations each contains 4 st.b, and 3 st.b after the > > loop */ > > +/* { dg-final { scan-assembler-times "st\\.b" 7 } } */ > > + > > +/* { dg-final { scan-assembler-not "st\\.h" } } */ > > +/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */ > > +/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */ > > + > > +extern char a[], b[]; > > +void test() { __builtin_memcpy(a, b, 15); } >
diff --git a/gcc/config/loongarch/loongarch-protos.h b/gcc/config/loongarch/loongarch-protos.h index 83df489c7a5..b71b188507a 100644 --- a/gcc/config/loongarch/loongarch-protos.h +++ b/gcc/config/loongarch/loongarch-protos.h @@ -95,7 +95,7 @@ extern void loongarch_expand_conditional_trap (rtx); #endif extern void loongarch_set_return_address (rtx, rtx); extern bool loongarch_move_by_pieces_p (unsigned HOST_WIDE_INT, unsigned int); -extern bool loongarch_expand_block_move (rtx, rtx, rtx); +extern bool loongarch_expand_block_move (rtx, rtx, rtx, rtx); extern bool loongarch_do_optimize_block_move_p (void); extern bool loongarch_expand_ext_as_unaligned_load (rtx, rtx, HOST_WIDE_INT, diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc index dfb731fca9d..06fc1cd0604 100644 --- a/gcc/config/loongarch/loongarch.cc +++ b/gcc/config/loongarch/loongarch.cc @@ -4459,41 +4459,38 @@ loongarch_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, Assume that the areas do not overlap. */ static void -loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length) +loongarch_block_move_straight (rtx dest, rtx src, HOST_WIDE_INT length, + HOST_WIDE_INT delta) { - HOST_WIDE_INT offset, delta; - unsigned HOST_WIDE_INT bits; + HOST_WIDE_INT offs, delta_cur; int i; machine_mode mode; rtx *regs; - bits = MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))); - - mode = int_mode_for_size (bits, 0).require (); - delta = bits / BITS_PER_UNIT; + HOST_WIDE_INT num_reg = length / delta; + for (delta_cur = delta / 2; delta_cur != 0; delta_cur /= 2) + num_reg += !!(length & delta_cur); /* Allocate a buffer for the temporary registers. */ - regs = XALLOCAVEC (rtx, length / delta); + regs = XALLOCAVEC (rtx, num_reg); - /* Load as many BITS-sized chunks as possible. Use a normal load if - the source has enough alignment, otherwise use left/right pairs. 
*/ - for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - regs[i] = gen_reg_rtx (mode); - loongarch_emit_move (regs[i], adjust_address (src, mode, offset)); - } + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); - for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++) - loongarch_emit_move (adjust_address (dest, mode, offset), regs[i]); + for (; offs + delta_cur <= length; offs += delta_cur, i++) + { + regs[i] = gen_reg_rtx (mode); + loongarch_emit_move (regs[i], adjust_address (src, mode, offs)); + } + } - /* Mop up any left-over bytes. */ - if (offset < length) + for (delta_cur = delta, i = 0, offs = 0; offs < length; delta_cur /= 2) { - src = adjust_address (src, BLKmode, offset); - dest = adjust_address (dest, BLKmode, offset); - move_by_pieces (dest, src, length - offset, - MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), - (enum memop_ret) 0); + mode = int_mode_for_size (delta_cur * BITS_PER_UNIT, 0).require (); + + for (; offs + delta_cur <= length; offs += delta_cur, i++) + loongarch_emit_move (adjust_address (dest, mode, offs), regs[i]); } } @@ -4523,10 +4520,11 @@ loongarch_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg, static void loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, - HOST_WIDE_INT bytes_per_iter) + HOST_WIDE_INT align) { rtx_code_label *label; rtx src_reg, dest_reg, final_src, test; + HOST_WIDE_INT bytes_per_iter = align * LARCH_MAX_MOVE_OPS_PER_LOOP_ITER; HOST_WIDE_INT leftover; leftover = length % bytes_per_iter; @@ -4546,7 +4544,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, emit_label (label); /* Emit the loop body. */ - loongarch_block_move_straight (dest, src, bytes_per_iter); + loongarch_block_move_straight (dest, src, bytes_per_iter, align); /* Move on to the next block. 
*/ loongarch_emit_move (src_reg, @@ -4563,7 +4561,7 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, /* Mop up any left-over bytes. */ if (leftover) - loongarch_block_move_straight (dest, src, leftover); + loongarch_block_move_straight (dest, src, leftover, align); else /* Temporary fix for PR79150. */ emit_insn (gen_nop ()); @@ -4573,25 +4571,32 @@ loongarch_block_move_loop (rtx dest, rtx src, HOST_WIDE_INT length, memory reference SRC to memory reference DEST. */ bool -loongarch_expand_block_move (rtx dest, rtx src, rtx length) +loongarch_expand_block_move (rtx dest, rtx src, rtx r_length, rtx r_align) { - int max_move_bytes = LARCH_MAX_MOVE_BYTES_STRAIGHT; + if (!CONST_INT_P (r_length)) + return false; + + HOST_WIDE_INT length = INTVAL (r_length); + if (length > loongarch_max_inline_memcpy_size) + return false; + + HOST_WIDE_INT align = INTVAL (r_align); + + if (!TARGET_STRICT_ALIGN || align > UNITS_PER_WORD) + align = UNITS_PER_WORD; - if (CONST_INT_P (length) - && INTVAL (length) <= loongarch_max_inline_memcpy_size) + if (length <= align * LARCH_MAX_MOVE_OPS_STRAIGHT) { - if (INTVAL (length) <= max_move_bytes) - { - loongarch_block_move_straight (dest, src, INTVAL (length)); - return true; - } - else if (optimize) - { - loongarch_block_move_loop (dest, src, INTVAL (length), - LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER); - return true; - } + loongarch_block_move_straight (dest, src, length, align); + return true; + } + + if (optimize) + { + loongarch_block_move_loop (dest, src, length, align); + return true; } + return false; } diff --git a/gcc/config/loongarch/loongarch.h b/gcc/config/loongarch/loongarch.h index 7151d5cabb3..1bcd144a5d9 100644 --- a/gcc/config/loongarch/loongarch.h +++ b/gcc/config/loongarch/loongarch.h @@ -1063,13 +1063,13 @@ typedef struct { /* The maximum number of bytes that can be copied by one iteration of a cpymemsi loop; see loongarch_block_move_loop. 
*/ -#define LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER (UNITS_PER_WORD * 4) +#define LARCH_MAX_MOVE_OPS_PER_LOOP_ITER 4 /* The maximum number of bytes that can be copied by a straight-line implementation of cpymemsi; see loongarch_block_move_straight. We want to make sure that any loop-based implementation will iterate at least twice. */ -#define LARCH_MAX_MOVE_BYTES_STRAIGHT (LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER * 2) +#define LARCH_MAX_MOVE_OPS_STRAIGHT (LARCH_MAX_MOVE_OPS_PER_LOOP_ITER * 2) /* The base cost of a memcpy call, for MOVE_RATIO and friends. These values were determined experimentally by benchmarking with CSiBE. @@ -1077,7 +1077,7 @@ typedef struct { #define LARCH_CALL_RATIO 8 /* Any loop-based implementation of cpymemsi will have at least - LARCH_MAX_MOVE_BYTES_STRAIGHT / UNITS_PER_WORD memory-to-memory + LARCH_MAX_MOVE_OPS_PER_LOOP_ITER memory-to-memory moves, so allow individual copies of fewer elements. When cpymemsi is not available, use a value approximating @@ -1088,9 +1088,7 @@ typedef struct { value of LARCH_CALL_RATIO to take that into account. */ #define MOVE_RATIO(speed) \ - (HAVE_cpymemsi \ - ? LARCH_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD \ - : CLEAR_RATIO (speed) / 2) + (HAVE_cpymemsi ? LARCH_MAX_MOVE_OPS_PER_LOOP_ITER : CLEAR_RATIO (speed) / 2) /* For CLEAR_RATIO, when optimizing for size, give a better estimate of the length of a memset call, but use the default otherwise. 
*/ diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md index 628ecc78088..816a943d155 100644 --- a/gcc/config/loongarch/loongarch.md +++ b/gcc/config/loongarch/loongarch.md @@ -2488,7 +2488,8 @@ (define_expand "cpymemsi" "" { if (TARGET_DO_OPTIMIZE_BLOCK_MOVE_P - && loongarch_expand_block_move (operands[0], operands[1], operands[2])) + && loongarch_expand_block_move (operands[0], operands[1], + operands[2], operands[3])) DONE; else FAIL; diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-1.c b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c new file mode 100644 index 00000000000..4cd35d13904 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-1.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mabi=lp64d -mno-strict-align" } */ +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ + +extern char a[], b[]; +void test() { __builtin_memcpy(a, b, 15); } diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-2.c b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c new file mode 100644 index 00000000000..703eb951c6d --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-2.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ +/* { dg-final { scan-assembler-times "st\\.d|stptr\\.d" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.w|stptr\\.w" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.h" 1 } } */ +/* { dg-final { scan-assembler-times "st\\.b" 1 } } */ + +extern long a[], b[]; +void test() { __builtin_memcpy(a, b, 15); } diff --git a/gcc/testsuite/gcc.target/loongarch/pr109465-3.c b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c new file mode 100644 index 00000000000..d6a80659b31 --- /dev/null +++ b/gcc/testsuite/gcc.target/loongarch/pr109465-3.c @@ -0,0 
+1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mabi=lp64d -mstrict-align" } */ + +/* Three loop iterations each contains 4 st.b, and 3 st.b after the loop */ +/* { dg-final { scan-assembler-times "st\\.b" 7 } } */ + +/* { dg-final { scan-assembler-not "st\\.h" } } */ +/* { dg-final { scan-assembler-not "st\\.w|stptr\\.w" } } */ +/* { dg-final { scan-assembler-not "st\\.d|stptr\\.d" } } */ + +extern char a[], b[]; +void test() { __builtin_memcpy(a, b, 15); }