Message ID | VI1PR0801MB20313B74A43EAC4756E8C8BDFFC80@VI1PR0801MB2031.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Tamar Christina <Tamar.Christina@arm.com> writes: > Hi All, > > This patch allows larger bitsizes to be used as copy size > when the target does not have SLOW_UNALIGNED_ACCESS. > > It also provides an optimized routine for MEM to REG > copying which avoid reconstructing the value piecewise on the stack > and instead uses a combination of shifts and ORs. > > This now generates > > adrp x0, .LANCHOR0 > add x0, x0, :lo12:.LANCHOR0 > sub sp, sp, #16 > ldr w1, [x0, 120] > str w1, [sp, 8] > ldr x0, [x0, 112] > ldr x1, [sp, 8] > add sp, sp, 16 > > instead of: > > adrp x3, .LANCHOR0 > add x3, x3, :lo12:.LANCHOR0 > mov x0, 0 > mov x1, 0 > sub sp, sp, #16 > ldr x2, [x3, 112] > ldr w3, [x3, 120] > add sp, sp, 16 > ubfx x5, x2, 8, 8 > bfi x0, x2, 0, 8 > ubfx x4, x2, 16, 8 > lsr w9, w2, 24 > bfi x0, x5, 8, 8 > ubfx x7, x2, 32, 8 > ubfx x5, x2, 40, 8 > ubfx x8, x3, 8, 8 > bfi x0, x4, 16, 8 > bfi x1, x3, 0, 8 > ubfx x4, x2, 48, 8 > ubfx x6, x3, 16, 8 > bfi x0, x9, 24, 8 > bfi x1, x8, 8, 8 > lsr x2, x2, 56 > lsr w3, w3, 24 > bfi x0, x7, 32, 8 > bfi x1, x6, 16, 8 > bfi x0, x5, 40, 8 > bfi x1, x3, 24, 8 > bfi x0, x4, 48, 8 > bfi x0, x2, 56, 8 > > To load a 12 1-byte element struct. Nice! [...] > diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c > index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644 > --- a/gcc/config/aarch64/aarch64.c > +++ b/gcc/config/aarch64/aarch64.c > @@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands) > base = copy_to_mode_reg (Pmode, XEXP (src, 0)); > src = adjust_automodify_address (src, VOIDmode, base, 0); > > + /* Optimize routines for MEM to REG copies. */ > + if (n < 8 && !REG_P (XEXP (operands[0], 0))) This seems to be checking that the address of the original destination memory isn't a plain base register. Why's it important to reject that case but allow e.g. base+offset? > + { > + unsigned int max_align = UINTVAL (operands[2]); > + max_align = n < max_align ? 
max_align : n; Might be misunderstanding, but isn't max_align always equal to n here, since n was set by: n = UINTVAL (operands[2]); Indentation of the enclosing { ... } is slightly off. > + machine_mode mov_mode, dest_mode > + = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT); > + rtx result = gen_reg_rtx (dest_mode); > + emit_insn (gen_move_insn (result, GEN_INT (0))); > + > + unsigned int shift_cnt = 0; > + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode)) > + { > + int nearest = 0; > + /* Find the mode to use, but limit the max to TI mode. */ > + for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2) > + nearest = max; In the if statement above, you required n < 8, so can max ever by > 16 here? > + mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT); > + rtx reg = gen_reg_rtx (mov_mode); > + > + src = adjust_address (src, mov_mode, 0); > + emit_insn (gen_move_insn (reg, src)); > + src = aarch64_progress_pointer (src); > + > + reg = gen_rtx_ASHIFT (dest_mode, reg, > + GEN_INT (shift_cnt * BITS_PER_UNIT)); This seems to be mixing modes: reg has mode "mov_mode" but the result has mode "dest_mode". That isn't well-formed: the mode of a shift result needs to be the same as the mode of the operand. I think the load would need to be a zero-extend of "src" into "reg", with "reg" having mode "dest_mode". > + result = gen_rtx_IOR (dest_mode, reg, result); > + } > + > + dst = adjust_address (dst, dest_mode, 0); > + emit_insn (gen_move_insn (dst, result)); dest_mode was chosen by smallest_mode_for_size, so can be bigger than n. Doesn't that mean that we'll write beyond the end of the copy region when n is an awkward number? 
> diff --git a/gcc/expr.c b/gcc/expr.c > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644 > --- a/gcc/expr.c > +++ b/gcc/expr.c > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src) > > n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; > dst_words = XALLOCAVEC (rtx, n_regs); > - bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); > + bitsize = BITS_PER_WORD; > + if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src)))) > + bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); I think this ought to be testing word_mode instead of BLKmode. (Testing BLKmode doesn't really make sense in general, because the mode doesn't have a meaningful alignment.) Thanks, Richard
> > + /* Optimize routines for MEM to REG copies. */ if (n < 8 && > > + !REG_P (XEXP (operands[0], 0))) > > This seems to be checking that the address of the original destination > memory isn't a plain base register. Why's it important to reject that case but > allow e.g. base+offset? > In the case of e.g. void Fun3(struct struct3 foo3) { L3 = foo3; } > > + { > > + unsigned int max_align = UINTVAL (operands[2]); > > + max_align = n < max_align ? max_align : n; > > Might be misunderstanding, but isn't max_align always equal to n here, since > n was set by: > > n = UINTVAL (operands[2]); > > Indentation of the enclosing { ... } is slightly off. > > > + machine_mode mov_mode, dest_mode > > + = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT); > > + rtx result = gen_reg_rtx (dest_mode); > > + emit_insn (gen_move_insn (result, GEN_INT (0))); > > + > > + unsigned int shift_cnt = 0; > > + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode)) > > + { > > + int nearest = 0; > > + /* Find the mode to use, but limit the max to TI mode. */ > > + for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= > 2) > > + nearest = max; > > In the if statement above, you required n < 8, so can max ever by > 16 here? > > > + mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, > MODE_INT); > > + rtx reg = gen_reg_rtx (mov_mode); > > + > > + src = adjust_address (src, mov_mode, 0); > > + emit_insn (gen_move_insn (reg, src)); > > + src = aarch64_progress_pointer (src); > > + > > + reg = gen_rtx_ASHIFT (dest_mode, reg, > > + GEN_INT (shift_cnt * BITS_PER_UNIT)); > > This seems to be mixing modes: reg has mode "mov_mode" but the result > has mode "dest_mode". That isn't well-formed: the mode of a shift result > needs to be the same as the mode of the operand. I think the load would > need to be a zero-extend of "src" into "reg", with "reg" having mode > "dest_mode". 
> > > + result = gen_rtx_IOR (dest_mode, reg, result); > > + } > > + > > + dst = adjust_address (dst, dest_mode, 0); > > + emit_insn (gen_move_insn (dst, result)); > > dest_mode was chosen by smallest_mode_for_size, so can be bigger than n. > Doesn't that mean that we'll write beyond the end of the copy region when > n is an awkward number? > > > diff --git a/gcc/expr.c b/gcc/expr.c > > index > > > 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce > 8e > > e5a19297ab16 100644 > > --- a/gcc/expr.c > > +++ b/gcc/expr.c > > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, > tree > > src) > > > > n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; > > dst_words = XALLOCAVEC (rtx, n_regs); > > - bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); > > + bitsize = BITS_PER_WORD; > > + if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE > (src)))) > > + bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); > > I think this ought to be testing word_mode instead of BLKmode. > (Testing BLKmode doesn't really make sense in general, because the mode > doesn't have a meaningful alignment.) > > Thanks, > Richard
> > + /* Optimize routines for MEM to REG copies. */ if (n < 8 && > > + !REG_P (XEXP (operands[0], 0))) > > This seems to be checking that the address of the original destination > memory isn't a plain base register. Why's it important to reject that case but > allow e.g. base+offset? This function is (as far as I could tell) only being called with two types of destinations: a location on the stack or a plain register. When the destination is a register such as with void Fun3(struct struct3 foo3) { L3 = foo3; } You run into the issue you had pointed to below where we might write too much. Ideally the constraint I would like is to check if the destination is either a new register (no data) or that the structure was padded. I couldn't figure out how to do this and the gains over the existing code for this case were quite small. So I just disallowed it, leaving it to the existing code, which isn't so bad, only 1 extra instruction. > > > + { > > + unsigned int max_align = UINTVAL (operands[2]); > > + max_align = n < max_align ? max_align : n; > > Might be misunderstanding, but isn't max_align always equal to n here, since > n was set by: Correct, previously this patch was made to allow n < 16. These were left over from the cleanup. I'll correct. > > > + result = gen_rtx_IOR (dest_mode, reg, result); > > + } > > + > > + dst = adjust_address (dst, dest_mode, 0); > > + emit_insn (gen_move_insn (dst, result)); > > dest_mode was chosen by smallest_mode_for_size, so can be bigger than n. > Doesn't that mean that we'll write beyond the end of the copy region when > n is an awkward number? Yes, see my answer above. For the other case, when we write onto a location on the stack, this is fine due to the alignment. 
> > > diff --git a/gcc/expr.c b/gcc/expr.c > > index > > > 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce > 8e > > e5a19297ab16 100644 > > --- a/gcc/expr.c > > +++ b/gcc/expr.c > > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, > tree > > src) > > > > n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; > > dst_words = XALLOCAVEC (rtx, n_regs); > > - bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); > > + bitsize = BITS_PER_WORD; > > + if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE > (src)))) > > + bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); > > I think this ought to be testing word_mode instead of BLKmode. > (Testing BLKmode doesn't really make sense in general, because the mode > doesn't have a meaningful alignment.) Ah, yes that makes sense. I'll update the patch. New patch is validating and will submit it soon. > > Thanks, > Richard
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands) base = copy_to_mode_reg (Pmode, XEXP (src, 0)); src = adjust_automodify_address (src, VOIDmode, base, 0); + /* Optimize routines for MEM to REG copies. */ + if (n < 8 && !REG_P (XEXP (operands[0], 0))) + { + unsigned int max_align = UINTVAL (operands[2]); + max_align = n < max_align ? max_align : n; + machine_mode mov_mode, dest_mode + = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT); + rtx result = gen_reg_rtx (dest_mode); + emit_insn (gen_move_insn (result, GEN_INT (0))); + + unsigned int shift_cnt = 0; + for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode)) + { + int nearest = 0; + /* Find the mode to use, but limit the max to TI mode. */ + for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2) + nearest = max; + + mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT); + rtx reg = gen_reg_rtx (mov_mode); + + src = adjust_address (src, mov_mode, 0); + emit_insn (gen_move_insn (reg, src)); + src = aarch64_progress_pointer (src); + + reg = gen_rtx_ASHIFT (dest_mode, reg, + GEN_INT (shift_cnt * BITS_PER_UNIT)); + result = gen_rtx_IOR (dest_mode, reg, result); + } + + dst = adjust_address (dst, dest_mode, 0); + emit_insn (gen_move_insn (dst, result)); + return true; + } + /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a 1-byte chunk. 
*/ if (n < 4) diff --git a/gcc/expr.c b/gcc/expr.c index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src) n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; dst_words = XALLOCAVEC (rtx, n_regs); - bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); + bitsize = BITS_PER_WORD; + if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src)))) + bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD); /* Copy the structure BITSIZE bits at a time. */ for (bitpos = 0, xbitpos = padding_correction;