@@ -24,10 +24,10 @@ (define_peephole2
(set (match_operand:GPI 2 "register_operand" "")
(match_operand:GPI 3 "memory_operand" ""))]
"aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true);
+ DONE;
})
(define_peephole2
@@ -36,10 +36,10 @@ (define_peephole2
(set (match_operand:GPI 2 "memory_operand" "")
(match_operand:GPI 3 "aarch64_reg_or_zero" ""))]
"aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, false);
+ aarch64_finish_ldpstp_peephole (operands, false);
+ DONE;
})
(define_peephole2
@@ -48,10 +48,10 @@ (define_peephole2
(set (match_operand:GPF 2 "register_operand" "")
(match_operand:GPF 3 "memory_operand" ""))]
"aarch64_operands_ok_for_ldpstp (operands, true, <MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true);
+ DONE;
})
(define_peephole2
@@ -60,10 +60,10 @@ (define_peephole2
(set (match_operand:GPF 2 "memory_operand" "")
(match_operand:GPF 3 "aarch64_reg_or_fp_zero" ""))]
"aarch64_operands_ok_for_ldpstp (operands, false, <MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, false);
+ aarch64_finish_ldpstp_peephole (operands, false);
+ DONE;
})
(define_peephole2
@@ -72,10 +72,10 @@ (define_peephole2
(set (match_operand:DREG2 2 "register_operand" "")
(match_operand:DREG2 3 "memory_operand" ""))]
"aarch64_operands_ok_for_ldpstp (operands, true, <DREG:MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true);
+ DONE;
})
(define_peephole2
@@ -84,10 +84,10 @@ (define_peephole2
(set (match_operand:DREG2 2 "memory_operand" "")
(match_operand:DREG2 3 "register_operand" ""))]
"aarch64_operands_ok_for_ldpstp (operands, false, <DREG:MODE>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, false);
+ aarch64_finish_ldpstp_peephole (operands, false);
+ DONE;
})
(define_peephole2
@@ -99,10 +99,10 @@ (define_peephole2
&& aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true);
+ DONE;
})
(define_peephole2
@@ -114,10 +114,10 @@ (define_peephole2
&& aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode)
&& (aarch64_tune_params.extra_tuning_flags
& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, false);
+ aarch64_finish_ldpstp_peephole (operands, false);
+ DONE;
})
@@ -129,10 +129,10 @@ (define_peephole2
(set (match_operand:DI 2 "register_operand" "")
(sign_extend:DI (match_operand:SI 3 "memory_operand" "")))]
"aarch64_operands_ok_for_ldpstp (operands, true, SImode)"
- [(parallel [(set (match_dup 0) (sign_extend:DI (match_dup 1)))
- (set (match_dup 2) (sign_extend:DI (match_dup 3)))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true, SIGN_EXTEND);
+ DONE;
})
(define_peephole2
@@ -141,10 +141,10 @@ (define_peephole2
(set (match_operand:DI 2 "register_operand" "")
(zero_extend:DI (match_operand:SI 3 "memory_operand" "")))]
"aarch64_operands_ok_for_ldpstp (operands, true, SImode)"
- [(parallel [(set (match_dup 0) (zero_extend:DI (match_dup 1)))
- (set (match_dup 2) (zero_extend:DI (match_dup 3)))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, true);
+ aarch64_finish_ldpstp_peephole (operands, true, ZERO_EXTEND);
+ DONE;
})
;; Handle storing of a floating point zero with integer data.
@@ -163,10 +163,10 @@ (define_peephole2
(set (match_operand:<FCVT_TARGET> 2 "memory_operand" "")
(match_operand:<FCVT_TARGET> 3 "aarch64_reg_zero_or_fp_zero" ""))]
"aarch64_operands_ok_for_ldpstp (operands, false, <V_INT_EQUIV>mode)"
- [(parallel [(set (match_dup 0) (match_dup 1))
- (set (match_dup 2) (match_dup 3))])]
+ [(const_int 0)]
{
- aarch64_swap_ldrstr_operands (operands, false);
+ aarch64_finish_ldpstp_peephole (operands, false);
+ DONE;
})
;; Handle consecutive load/store whose offset is out of the range
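
Each peephole above now uses the dummy replacement template [(const_int 0)]
and finishes with DONE: rather than having the generator rebuild a two-set
PARALLEL from the template, the C++ body emits the pair insn itself.  A rough
sketch of what that amounts to for a non-extending load pair (the helper
itself is added later in this patch):

    /* Sketch only: equivalent of aarch64_finish_ldpstp_peephole (operands, true).  */
    aarch64_swap_ldrstr_operands (operands, /*load_p=*/true);
    emit_insn (aarch64_gen_load_pair (operands[0], operands[2], operands[1]));
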
@@ -96,9 +96,13 @@ INT_MODE (XI, 64);
/* V8DI mode. */
VECTOR_MODE_WITH_PREFIX (V, INT, DI, 8, 5);
-
ADJUST_ALIGNMENT (V8DI, 8);
+/* V2x4QImode. Used in load/store pair patterns. */
+VECTOR_MODE_WITH_PREFIX (V2x, INT, QI, 4, 5);
+ADJUST_NUNITS (V2x4QI, 8);
+ADJUST_ALIGNMENT (V2x4QI, 4);
+
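
For reference, the mode of the single pair mem is chosen purely from the size
of one register operand (see aarch64_pair_mode_for_mode further down); V2x8QI
and V2x16QI already exist as Advanced SIMD struct modes, and V2x4QI is added
here so the 32-bit case gets the same treatment:

    /* Assumed summary of the size -> pair-mem-mode mapping used by this patch:
         4-byte operands (SI, SF)               -> V2x4QImode  (8-byte mem, align 4)
         8-byte operands (DI, DF, 64-bit vec)   -> V2x8QImode  (16-byte mem)
         16-byte operands (TI, TF, 128-bit vec) -> V2x16QImode (32-byte mem)  */
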
/* Define Advanced SIMD modes for structures of 2, 3 and 4 d-registers. */
#define ADV_SIMD_D_REG_STRUCT_MODES(NVECS, VB, VH, VS, VD) \
VECTOR_MODES_WITH_PREFIX (V##NVECS##x, INT, 8, 3); \
@@ -980,6 +980,8 @@ void aarch64_split_compare_and_swap (rtx op[]);
void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
bool aarch64_gen_adjusted_ldpstp (rtx *, bool, machine_mode, RTX_CODE);
+void aarch64_finish_ldpstp_peephole (rtx *, bool,
+ enum rtx_code = (enum rtx_code)0);
void aarch64_expand_sve_vec_cmp_int (rtx, rtx_code, rtx, rtx);
bool aarch64_expand_sve_vec_cmp_float (rtx, rtx_code, rtx, rtx, bool);
@@ -1040,8 +1042,9 @@ bool aarch64_mergeable_load_pair_p (machine_mode, rtx, rtx);
bool aarch64_operands_ok_for_ldpstp (rtx *, bool, machine_mode);
bool aarch64_operands_adjust_ok_for_ldpstp (rtx *, bool, machine_mode);
bool aarch64_mem_ok_with_ldpstp_policy_model (rtx, bool, machine_mode);
-void aarch64_swap_ldrstr_operands (rtx *, bool);
bool aarch64_ldpstp_operand_mode_p (machine_mode);
+rtx aarch64_gen_load_pair (rtx, rtx, rtx, enum rtx_code = (enum rtx_code)0);
+rtx aarch64_gen_store_pair (rtx, rtx, rtx);
extern void aarch64_asm_output_pool_epilogue (FILE *, const char *,
tree, HOST_WIDE_INT);
@@ -232,38 +232,6 @@ (define_insn "aarch64_store_lane0<mode>"
[(set_attr "type" "neon_store1_1reg<q>")]
)
-(define_insn "load_pair<DREG:mode><DREG2:mode>"
- [(set (match_operand:DREG 0 "register_operand")
- (match_operand:DREG 1 "aarch64_mem_pair_operand"))
- (set (match_operand:DREG2 2 "register_operand")
- (match_operand:DREG2 3 "memory_operand"))]
- "TARGET_FLOAT
- && rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (<DREG:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type ]
- [ w , Ump , w , m ; neon_ldp ] ldp\t%d0, %d2, %z1
- [ r , Ump , r , m ; load_16 ] ldp\t%x0, %x2, %z1
- }
-)
-
-(define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
- [(set (match_operand:DREG 0 "aarch64_mem_pair_operand")
- (match_operand:DREG 1 "register_operand"))
- (set (match_operand:DREG2 2 "memory_operand")
- (match_operand:DREG2 3 "register_operand"))]
- "TARGET_FLOAT
- && rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (<DREG:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type ]
- [ Ump , w , m , w ; neon_stp ] stp\t%d1, %d3, %z0
- [ Ump , r , m , r ; store_16 ] stp\t%x1, %x3, %z0
- }
-)
-
(define_insn "aarch64_simd_stp<mode>"
[(set (match_operand:VP_2E 0 "aarch64_mem_pair_lanes_operand")
(vec_duplicate:VP_2E (match_operand:<VEL> 1 "register_operand")))]
@@ -274,34 +242,6 @@ (define_insn "aarch64_simd_stp<mode>"
}
)
-(define_insn "load_pair<VQ:mode><VQ2:mode>"
- [(set (match_operand:VQ 0 "register_operand" "=w")
- (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
- (set (match_operand:VQ2 2 "register_operand" "=w")
- (match_operand:VQ2 3 "memory_operand" "m"))]
- "TARGET_FLOAT
- && rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (<VQ:MODE>mode)))"
- "ldp\\t%q0, %q2, %z1"
- [(set_attr "type" "neon_ldp_q")]
-)
-
-(define_insn "vec_store_pair<VQ:mode><VQ2:mode>"
- [(set (match_operand:VQ 0 "aarch64_mem_pair_operand" "=Ump")
- (match_operand:VQ 1 "register_operand" "w"))
- (set (match_operand:VQ2 2 "memory_operand" "=m")
- (match_operand:VQ2 3 "register_operand" "w"))]
- "TARGET_FLOAT
- && rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (<VQ:MODE>mode)))"
- "stp\\t%q1, %q3, %z0"
- [(set_attr "type" "neon_stp_q")]
-)
-
(define_expand "@aarch64_split_simd_mov<mode>"
[(set (match_operand:VQMOV 0)
(match_operand:VQMOV 1))]
@@ -5214,15 +5214,17 @@ aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
/* Combine the sets with any stack allocation/deallocation. */
- rtvec vec;
+ rtx pat;
if (prev_loc->index == 0)
{
rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
- vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
+ rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
+ pat = gen_rtx_PARALLEL (VOIDmode, vec);
}
+ else if (seq == PROLOGUE)
+ pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
else
- vec = gen_rtvec (2, set1, set2);
- rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
+ pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
/* Queue a deallocation to the end, otherwise emit the
instruction now. */
@@ -8176,59 +8178,87 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
}
}
-/* Generate and return a store pair instruction of mode MODE to store
- register REG1 to MEM1 and register REG2 to MEM2. */
+/* Given an ldp/stp register operand mode MODE, return a suitable mode to use
+ for a mem rtx representing the entire pair. */
-static rtx
-aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
- rtx reg2)
-{
- switch (mode)
- {
- case E_DImode:
- return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
+static machine_mode
+aarch64_pair_mode_for_mode (machine_mode mode)
+{
+ if (known_eq (GET_MODE_SIZE (mode), 4))
+ return V2x4QImode;
+ else if (known_eq (GET_MODE_SIZE (mode), 8))
+ return V2x8QImode;
+ else if (known_eq (GET_MODE_SIZE (mode), 16))
+ return V2x16QImode;
+ else
+ gcc_unreachable ();
+}
- case E_DFmode:
- return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
+/* Given a base mem MEM with a mode suitable for an ldp/stp register operand,
+ return an rtx like MEM which instead represents the entire pair. */
- case E_TFmode:
- return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
+static rtx
+aarch64_pair_mem_from_base (rtx mem)
+{
+ auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
+ mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
+ gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
+ return mem;
+}
- case E_V4SImode:
- return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
+/* Generate and return a store pair instruction to store REG1 and REG2
+ into memory starting at BASE_MEM. All three rtxes should have modes of the
+ same size. */
- case E_V16QImode:
- return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
+rtx
+aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
+{
+ rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
- default:
- gcc_unreachable ();
- }
+ return gen_rtx_SET (pair_mem,
+ gen_rtx_UNSPEC (GET_MODE (pair_mem),
+ gen_rtvec (2, reg1, reg2),
+ UNSPEC_STP));
}
-/* Generate and regurn a load pair isntruction of mode MODE to load register
- REG1 from MEM1 and register REG2 from MEM2. */
+/* Generate and return a load pair instruction to load a pair of
+ registers starting at BASE_MEM into REG1 and REG2. If CODE is
+ UNKNOWN, all three rtxes should have modes of the same size.
+ Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
+ and REG{1,2} should be in DImode. */
-static rtx
-aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
- rtx mem2)
+rtx
+aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
{
- switch (mode)
- {
- case E_DImode:
- return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
+ rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
- case E_DFmode:
- return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
-
- case E_TFmode:
- return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
+ const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
+ if (any_extend_p)
+ {
+ gcc_checking_assert (GET_MODE (base_mem) == SImode
+ && GET_MODE (reg1) == DImode
+ && GET_MODE (reg2) == DImode);
+ }
+ else
+ gcc_assert (code == UNKNOWN);
+
+ rtx unspecs[2] = {
+ gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
+ gen_rtvec (1, pair_mem),
+ UNSPEC_LDP_FST),
+ gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
+ gen_rtvec (1, copy_rtx (pair_mem)),
+ UNSPEC_LDP_SND)
+ };
- case E_V4SImode:
- return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
+ if (any_extend_p)
+ for (int i = 0; i < 2; i++)
+ unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
- default:
- gcc_unreachable ();
- }
+ return gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (2,
+ gen_rtx_SET (reg1, unspecs[0]),
+ gen_rtx_SET (reg2, unspecs[1])));
}
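
With the new interface a caller constructs only the mem for the first
register's slot; aarch64_pair_mem_from_base widens it to the V2x pair mode
internally.  A minimal hypothetical call site (the register numbers and the
offset are made up for illustration):

    rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, stack_pointer_rtx, 16));
    rtx x0 = gen_rtx_REG (DImode, R0_REGNUM);
    rtx x1 = gen_rtx_REG (DImode, R1_REGNUM);
    emit_insn (aarch64_gen_store_pair (mem, x0, x1)); /* roughly: stp x0, x1, [sp, 16] */
    emit_insn (aarch64_gen_load_pair (x0, x1, mem));  /* roughly: ldp x0, x1, [sp, 16] */
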
/* Return TRUE if return address signing should be enabled for the current
@@ -8411,7 +8441,7 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
}
rtx base_rtx = stack_pointer_rtx;
- poly_int64 sp_offset = offset;
+ poly_int64 cfa_offset = offset;
HOST_WIDE_INT const_offset;
if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
@@ -8436,8 +8466,17 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
offset -= fp_offset;
}
rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
- bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
+ rtx cfa_base = stack_pointer_rtx;
+ if (hard_fp_valid_p && frame_pointer_needed)
+ {
+ cfa_base = hard_frame_pointer_rtx;
+ cfa_offset += (bytes_below_sp - frame.bytes_below_hard_fp);
+ }
+
+ rtx cfa_mem = gen_frame_mem (mode,
+ plus_constant (Pmode,
+ cfa_base, cfa_offset));
unsigned int regno2;
if (!aarch64_sve_mode_p (mode)
&& reg == move_src
@@ -8447,12 +8486,9 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
frame.reg_offset[regno2] - frame.reg_offset[regno]))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
- rtx mem2;
offset += GET_MODE_SIZE (mode);
- mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
- insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
- reg2));
+ insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
/* The first part of a frame-related parallel insn is
always assumed to be relevant to the frame
@@ -8460,31 +8496,28 @@ aarch64_save_callee_saves (poly_int64 bytes_below_sp,
frame-related if explicitly marked. */
if (aarch64_emit_cfi_for_reg_p (regno2))
{
- if (need_cfa_note_p)
- aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
- sp_offset + GET_MODE_SIZE (mode));
- else
- RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
+ const auto off = cfa_offset + GET_MODE_SIZE (mode);
+ rtx cfa_mem2 = gen_frame_mem (mode,
+ plus_constant (Pmode,
+ cfa_base,
+ off));
+ add_reg_note (insn, REG_CFA_OFFSET,
+ gen_rtx_SET (cfa_mem2, reg2));
}
regno = regno2;
++i;
}
else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
- {
- insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, move_src));
- need_cfa_note_p = true;
- }
+ insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, move_src));
else if (aarch64_sve_mode_p (mode))
insn = emit_insn (gen_rtx_SET (mem, move_src));
else
insn = emit_move_insn (mem, move_src);
RTX_FRAME_RELATED_P (insn) = frame_related_p;
- if (frame_related_p && need_cfa_note_p)
- aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
- else if (frame_related_p && move_src != reg)
- add_reg_note (insn, REG_FRAME_RELATED_EXPR, gen_rtx_SET (mem, reg));
+ if (frame_related_p)
+ add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (cfa_mem, reg));
/* Emit a fake instruction to indicate that the VG save slot has
been initialized. */
@@ -8548,11 +8581,9 @@ aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
frame.reg_offset[regno2] - frame.reg_offset[regno]))
{
rtx reg2 = gen_rtx_REG (mode, regno2);
- rtx mem2;
offset += GET_MODE_SIZE (mode);
- mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
- emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
+ emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
regno = regno2;
@@ -8896,9 +8927,9 @@ aarch64_process_components (sbitmap components, bool prologue_p)
: gen_rtx_SET (reg2, mem2);
if (prologue_p)
- insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
+ insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
else
- insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
+ insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
if (frame_related_p || frame_related2_p)
{
@@ -10294,12 +10325,18 @@ aarch64_classify_address (struct aarch64_address_info *info,
mode of the corresponding addressing mode is half of that. */
if (type == ADDR_QUERY_LDP_STP_N)
{
- if (known_eq (GET_MODE_SIZE (mode), 16))
+ if (known_eq (GET_MODE_SIZE (mode), 32))
+ mode = V16QImode;
+ else if (known_eq (GET_MODE_SIZE (mode), 16))
mode = DFmode;
else if (known_eq (GET_MODE_SIZE (mode), 8))
mode = SFmode;
else
return false;
+
+ /* This isn't really an Advanced SIMD struct mode, but a mode
+ used to represent the complete mem in a load/store pair. */
+ advsimd_struct_p = false;
}
bool allow_reg_index_p = (!load_store_pair_p
@@ -10917,9 +10954,7 @@ aarch64_init_tpidr2_block ()
/* The first word of the block points to the save buffer and the second
word is the number of ZA slices to save. */
rtx block_0 = adjust_address (block, DImode, 0);
- rtx block_8 = adjust_address (block, DImode, 8);
- emit_insn (gen_store_pair_dw_didi (block_0, za_save_buffer,
- block_8, svl_bytes_reg));
+ emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
if (!memory_operand (block, V16QImode))
block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
@@ -12268,7 +12303,8 @@ aarch64_print_operand (FILE *f, rtx x, int code)
if (!MEM_P (x)
|| (code == 'y'
&& maybe_ne (GET_MODE_SIZE (mode), 8)
- && maybe_ne (GET_MODE_SIZE (mode), 16)))
+ && maybe_ne (GET_MODE_SIZE (mode), 16)
+ && maybe_ne (GET_MODE_SIZE (mode), 32)))
{
output_operand_lossage ("invalid operand for '%%%c'", code);
return;
@@ -25432,10 +25468,8 @@ aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
*src = adjust_address (*src, mode, 0);
*dst = adjust_address (*dst, mode, 0);
/* Emit the memcpy. */
- emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
- aarch64_progress_pointer (*src)));
- emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
- aarch64_progress_pointer (*dst), reg2));
+ emit_insn (aarch64_gen_load_pair (reg1, reg2, *src));
+ emit_insn (aarch64_gen_store_pair (*dst, reg1, reg2));
/* Move the pointers forward. */
*src = aarch64_move_pointer (*src, 32);
*dst = aarch64_move_pointer (*dst, 32);
@@ -25614,8 +25648,7 @@ aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
/* "Cast" the *dst to the correct mode. */
*dst = adjust_address (*dst, mode, 0);
/* Emit the memset. */
- emit_insn (aarch64_gen_store_pair (mode, *dst, src,
- aarch64_progress_pointer (*dst), src));
+ emit_insn (aarch64_gen_store_pair (*dst, src, src));
/* Move the pointers forward. */
*dst = aarch64_move_pointer (*dst, 32);
@@ -26812,6 +26845,29 @@ aarch64_swap_ldrstr_operands (rtx* operands, bool load)
}
}
+/* Helper function used for generation of load/store pair instructions,
+   called from peepholes in aarch64-ldpstp.md.  OPERANDS is an array of
+   operands as matched by the peepholes in that file.  LOAD_P is true if
+   we're generating a load pair, otherwise we're generating a store pair.
+   CODE is either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if
+   we're generating a standard load/store pair.  */
+
+void
+aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
+{
+ aarch64_swap_ldrstr_operands (operands, load_p);
+
+ if (load_p)
+ emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
+ operands[1], code));
+ else
+ {
+ gcc_assert (code == UNKNOWN);
+ emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
+ operands[3]));
+ }
+}
+
/* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
comparison between the two. */
int
@@ -26993,10 +27049,10 @@ bool
aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
machine_mode mode, RTX_CODE code)
{
- rtx base, offset_1, offset_3, t1, t2;
- rtx mem_1, mem_2, mem_3, mem_4;
+ rtx base, offset_1, offset_2;
+ rtx mem_1, mem_2;
rtx temp_operands[8];
- HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
+ HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
stp_off_upper_limit, stp_off_lower_limit, msize;
/* We make changes on a copy as we may still bail out. */
@@ -27019,23 +27075,19 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
if (load)
{
mem_1 = copy_rtx (temp_operands[1]);
- mem_2 = copy_rtx (temp_operands[3]);
- mem_3 = copy_rtx (temp_operands[5]);
- mem_4 = copy_rtx (temp_operands[7]);
+ mem_2 = copy_rtx (temp_operands[5]);
}
else
{
mem_1 = copy_rtx (temp_operands[0]);
- mem_2 = copy_rtx (temp_operands[2]);
- mem_3 = copy_rtx (temp_operands[4]);
- mem_4 = copy_rtx (temp_operands[6]);
+ mem_2 = copy_rtx (temp_operands[4]);
gcc_assert (code == UNKNOWN);
}
extract_base_offset_in_addr (mem_1, &base, &offset_1);
- extract_base_offset_in_addr (mem_3, &base, &offset_3);
+ extract_base_offset_in_addr (mem_2, &base, &offset_2);
gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
- && offset_3 != NULL_RTX);
+ && offset_2 != NULL_RTX);
/* Adjust offset so it can fit in LDP/STP instruction. */
msize = GET_MODE_SIZE (mode).to_constant();
@@ -27043,11 +27095,11 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
stp_off_lower_limit = - msize * 0x40;
off_val_1 = INTVAL (offset_1);
- off_val_3 = INTVAL (offset_3);
+ off_val_2 = INTVAL (offset_2);
/* The base offset is optimally half way between the two STP/LDP offsets. */
if (msize <= 4)
- base_off = (off_val_1 + off_val_3) / 2;
+ base_off = (off_val_1 + off_val_2) / 2;
else
/* However, due to issues with negative LDP/STP offset generation for
larger modes, for DF, DD, DI and vector modes. we must not use negative
@@ -27087,73 +27139,58 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
new_off_1 = off_val_1 - base_off;
/* Offset of the second STP/LDP. */
- new_off_3 = off_val_3 - base_off;
+ new_off_2 = off_val_2 - base_off;
/* The offsets must be within the range of the LDP/STP instructions. */
if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
- || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
+ || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
return false;
replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
new_off_1), true);
replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
- new_off_1 + msize), true);
- replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
- new_off_3), true);
- replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
- new_off_3 + msize), true);
+ new_off_2), true);
if (!aarch64_mem_pair_operand (mem_1, mode)
- || !aarch64_mem_pair_operand (mem_3, mode))
+ || !aarch64_mem_pair_operand (mem_2, mode))
return false;
- if (code == ZERO_EXTEND)
- {
- mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
- mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
- mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
- mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
- }
- else if (code == SIGN_EXTEND)
- {
- mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
- mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
- mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
- mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
- }
-
if (load)
{
operands[0] = temp_operands[0];
operands[1] = mem_1;
operands[2] = temp_operands[2];
- operands[3] = mem_2;
operands[4] = temp_operands[4];
- operands[5] = mem_3;
+ operands[5] = mem_2;
operands[6] = temp_operands[6];
- operands[7] = mem_4;
}
else
{
operands[0] = mem_1;
operands[1] = temp_operands[1];
- operands[2] = mem_2;
operands[3] = temp_operands[3];
- operands[4] = mem_3;
+ operands[4] = mem_2;
operands[5] = temp_operands[5];
- operands[6] = mem_4;
operands[7] = temp_operands[7];
}
/* Emit adjusting instruction. */
emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
/* Emit ldp/stp instructions. */
- t1 = gen_rtx_SET (operands[0], operands[1]);
- t2 = gen_rtx_SET (operands[2], operands[3]);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
- t1 = gen_rtx_SET (operands[4], operands[5]);
- t2 = gen_rtx_SET (operands[6], operands[7]);
- emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
+ if (load)
+ {
+ emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
+ operands[1], code));
+ emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
+ operands[5], code));
+ }
+ else
+ {
+ emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
+ operands[3]));
+ emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
+ operands[7]));
+ }
return true;
}
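
As a worked illustration of the offset adjustment above (values made up, and
ignoring any extra clamping of base_off not shown in this hunk): for SImode,
msize is 4, so the scaled imm7 LDP/STP range is [-64 * 4, 63 * 4] =
[-256, 252].  If the two pair mems originally sit at offsets 1024 and
1032 (= 1024 + 2 * msize) from the base, then base_off = (1024 + 1032) / 2
= 1028, giving new_off_1 = -4 and new_off_2 = 4, both in range and 4-aligned;
a single add of 1028 into operands[8] is emitted, followed by the two pair
insns.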
@@ -228,6 +228,9 @@ (define_c_enum "unspec" [
UNSPEC_GOTSMALLTLS
UNSPEC_GOTTINYPIC
UNSPEC_GOTTINYTLS
+ UNSPEC_STP
+ UNSPEC_LDP_FST
+ UNSPEC_LDP_SND
UNSPEC_LD1
UNSPEC_LD2
UNSPEC_LD2_DREG
@@ -527,6 +530,11 @@ (define_attr "predicated" "yes,no" (const_string "no"))
;; may chose to hold the tracking state encoded in SP.
(define_attr "speculation_barrier" "true,false" (const_string "false"))
+;; Attribute used to identify load pair and store pair instructions.
+;; Currently the attribute is only applied to the non-writeback ldp/stp
+;; patterns.
+(define_attr "ldpstp" "ldp,stp,none" (const_string "none"))
+
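
Nothing in this patch reads the new attribute yet; presumably it is intended
for a later consumer.  Since define_attr makes genattrtab emit an accessor, a
hypothetical user could test it along these lines:

    /* Hypothetical consumer (not added by this patch): get_attr_ldpstp and the
       LDPSTP_{LDP,STP,NONE} values are generated from the define_attr above.  */
    if (recog_memoized (insn) >= 0
        && get_attr_ldpstp (insn) == LDPSTP_LDP)
      { /* INSN is a non-writeback load pair.  */ }
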
;; -------------------------------------------------------------------
;; Pipeline descriptions and scheduling
;; -------------------------------------------------------------------
@@ -1823,100 +1831,62 @@ (define_expand "setmemdi"
FAIL;
})
-;; Operands 1 and 3 are tied together by the final condition; so we allow
-;; fairly lax checking on the second memory operation.
-(define_insn "load_pair_sw_<SX:mode><SX2:mode>"
- [(set (match_operand:SX 0 "register_operand")
- (match_operand:SX 1 "aarch64_mem_pair_operand"))
- (set (match_operand:SX2 2 "register_operand")
- (match_operand:SX2 3 "memory_operand"))]
- "rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (<SX:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type , arch ]
- [ r , Ump , r , m ; load_8 , * ] ldp\t%w0, %w2, %z1
- [ w , Ump , w , m ; neon_load1_2reg , fp ] ldp\t%s0, %s2, %z1
- }
-)
-
-;; Storing different modes that can still be merged
-(define_insn "load_pair_dw_<DX:mode><DX2:mode>"
- [(set (match_operand:DX 0 "register_operand")
- (match_operand:DX 1 "aarch64_mem_pair_operand"))
- (set (match_operand:DX2 2 "register_operand")
- (match_operand:DX2 3 "memory_operand"))]
- "rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (<DX:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type , arch ]
- [ r , Ump , r , m ; load_16 , * ] ldp\t%x0, %x2, %z1
- [ w , Ump , w , m ; neon_load1_2reg , fp ] ldp\t%d0, %d2, %z1
- }
-)
-
-(define_insn "load_pair_dw_<TX:mode><TX2:mode>"
- [(set (match_operand:TX 0 "register_operand" "=w")
- (match_operand:TX 1 "aarch64_mem_pair_operand" "Ump"))
- (set (match_operand:TX2 2 "register_operand" "=w")
- (match_operand:TX2 3 "memory_operand" "m"))]
- "TARGET_BASE_SIMD
- && rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (<TX:MODE>mode)))"
- "ldp\\t%q0, %q2, %z1"
+(define_insn "*load_pair_<ldst_sz>"
+ [(set (match_operand:GPI 0 "aarch64_ldp_reg_operand")
+ (unspec [
+ (match_operand:<VPAIR> 1 "aarch64_mem_pair_lanes_operand")
+ ] UNSPEC_LDP_FST))
+ (set (match_operand:GPI 2 "aarch64_ldp_reg_operand")
+ (unspec [
+ (match_dup 1)
+ ] UNSPEC_LDP_SND))]
+ ""
+ {@ [cons: =0, 1, =2; attrs: type, arch]
+ [ r, Umn, r; load_<ldpstp_sz>, * ] ldp\t%<w>0, %<w>2, %y1
+ [ w, Umn, w; neon_load1_2reg, fp ] ldp\t%<v>0, %<v>2, %y1
+ }
+ [(set_attr "ldpstp" "ldp")]
+)
+
+(define_insn "*load_pair_16"
+ [(set (match_operand:TI 0 "aarch64_ldp_reg_operand" "=w")
+ (unspec [
+ (match_operand:V2x16QI 1 "aarch64_mem_pair_lanes_operand" "Umn")
+ ] UNSPEC_LDP_FST))
+ (set (match_operand:TI 2 "aarch64_ldp_reg_operand" "=w")
+ (unspec [
+ (match_dup 1)
+ ] UNSPEC_LDP_SND))]
+ "TARGET_FLOAT"
+ "ldp\\t%q0, %q2, %y1"
[(set_attr "type" "neon_ldp_q")
- (set_attr "fp" "yes")]
-)
-
-;; Operands 0 and 2 are tied together by the final condition; so we allow
-;; fairly lax checking on the second memory operation.
-(define_insn "store_pair_sw_<SX:mode><SX2:mode>"
- [(set (match_operand:SX 0 "aarch64_mem_pair_operand")
- (match_operand:SX 1 "aarch64_reg_zero_or_fp_zero"))
- (set (match_operand:SX2 2 "memory_operand")
- (match_operand:SX2 3 "aarch64_reg_zero_or_fp_zero"))]
- "rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (<SX:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type , arch ]
- [ Ump , rYZ , m , rYZ ; store_8 , * ] stp\t%w1, %w3, %z0
- [ Ump , w , m , w ; neon_store1_2reg , fp ] stp\t%s1, %s3, %z0
- }
-)
-
-;; Storing different modes that can still be merged
-(define_insn "store_pair_dw_<DX:mode><DX2:mode>"
- [(set (match_operand:DX 0 "aarch64_mem_pair_operand")
- (match_operand:DX 1 "aarch64_reg_zero_or_fp_zero"))
- (set (match_operand:DX2 2 "memory_operand")
- (match_operand:DX2 3 "aarch64_reg_zero_or_fp_zero"))]
- "rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (<DX:MODE>mode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type , arch ]
- [ Ump , rYZ , m , rYZ ; store_16 , * ] stp\t%x1, %x3, %z0
- [ Ump , w , m , w ; neon_store1_2reg , fp ] stp\t%d1, %d3, %z0
- }
-)
-
-(define_insn "store_pair_dw_<TX:mode><TX2:mode>"
- [(set (match_operand:TX 0 "aarch64_mem_pair_operand" "=Ump")
- (match_operand:TX 1 "register_operand" "w"))
- (set (match_operand:TX2 2 "memory_operand" "=m")
- (match_operand:TX2 3 "register_operand" "w"))]
- "TARGET_BASE_SIMD
- && rtx_equal_p (XEXP (operands[2], 0),
- plus_constant (Pmode,
- XEXP (operands[0], 0),
- GET_MODE_SIZE (TFmode)))"
- "stp\\t%q1, %q3, %z0"
+ (set_attr "fp" "yes")
+ (set_attr "ldpstp" "ldp")]
+)
+
+(define_insn "*store_pair_<ldst_sz>"
+ [(set (match_operand:<VPAIR> 0 "aarch64_mem_pair_lanes_operand")
+ (unspec:<VPAIR>
+ [(match_operand:GPI 1 "aarch64_stp_reg_operand")
+ (match_operand:GPI 2 "aarch64_stp_reg_operand")] UNSPEC_STP))]
+ ""
+ {@ [cons: =0, 1, 2; attrs: type , arch]
+ [ Umn, rYZ, rYZ; store_<ldpstp_sz>, * ] stp\t%<w>1, %<w>2, %y0
+ [ Umn, w, w; neon_store1_2reg , fp ] stp\t%<v>1, %<v>2, %y0
+ }
+ [(set_attr "ldpstp" "stp")]
+)
+
+(define_insn "*store_pair_16"
+ [(set (match_operand:V2x16QI 0 "aarch64_mem_pair_lanes_operand" "=Umn")
+ (unspec:V2x16QI
+ [(match_operand:TI 1 "aarch64_ldp_reg_operand" "w")
+ (match_operand:TI 2 "aarch64_ldp_reg_operand" "w")] UNSPEC_STP))]
+ "TARGET_FLOAT"
+ "stp\t%q1, %q2, %y0"
[(set_attr "type" "neon_stp_q")
- (set_attr "fp" "yes")]
+ (set_attr "fp" "yes")
+ (set_attr "ldpstp" "stp")]
)
;; Writeback load/store pair patterns.
@@ -2146,14 +2116,15 @@ (define_insn "*extendsidi2_aarch64"
(define_insn "*load_pair_extendsidi2_aarch64"
[(set (match_operand:DI 0 "register_operand" "=r")
- (sign_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand" "Ump")))
+ (sign_extend:DI (unspec:SI [
+ (match_operand:V2x4QI 1 "aarch64_mem_pair_lanes_operand" "Umn")
+ ] UNSPEC_LDP_FST)))
(set (match_operand:DI 2 "register_operand" "=r")
- (sign_extend:DI (match_operand:SI 3 "memory_operand" "m")))]
- "rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (SImode)))"
- "ldpsw\\t%0, %2, %z1"
+ (sign_extend:DI (unspec:SI [
+ (match_dup 1)
+ ] UNSPEC_LDP_SND)))]
+ ""
+ "ldpsw\\t%0, %2, %y1"
[(set_attr "type" "load_8")]
)
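
Because both halves now reference the same V2x4QI mem, the old rtx_equal_p
insn condition (which tied operand 3's address to operand 1's) is no longer
needed, hence the empty "" condition; adjacency is implied by the single pair
mem.  Roughly, the insn matched here now has this shape (illustrative, not an
actual RTL dump):

    /* (parallel
         [(set (reg:DI x0)
               (sign_extend:DI (unspec:SI [(mem:V2x4QI ...)] UNSPEC_LDP_FST)))
          (set (reg:DI x1)
               (sign_extend:DI (unspec:SI [(mem:V2x4QI ...)] UNSPEC_LDP_SND)))])  */
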
@@ -2173,16 +2144,17 @@ (define_insn "*zero_extendsidi2_aarch64"
(define_insn "*load_pair_zero_extendsidi2_aarch64"
[(set (match_operand:DI 0 "register_operand")
- (zero_extend:DI (match_operand:SI 1 "aarch64_mem_pair_operand")))
+ (zero_extend:DI (unspec:SI [
+ (match_operand:V2x4QI 1 "aarch64_mem_pair_lanes_operand")
+ ] UNSPEC_LDP_FST)))
(set (match_operand:DI 2 "register_operand")
- (zero_extend:DI (match_operand:SI 3 "memory_operand")))]
- "rtx_equal_p (XEXP (operands[3], 0),
- plus_constant (Pmode,
- XEXP (operands[1], 0),
- GET_MODE_SIZE (SImode)))"
- {@ [ cons: =0 , 1 , =2 , 3 ; attrs: type , arch ]
- [ r , Ump , r , m ; load_8 , * ] ldp\t%w0, %w2, %z1
- [ w , Ump , w , m ; neon_load1_2reg , fp ] ldp\t%s0, %s2, %z1
+ (zero_extend:DI (unspec:SI [
+ (match_dup 1)
+ ] UNSPEC_LDP_SND)))]
+ ""
+ {@ [ cons: =0 , 1 , =2; attrs: type , arch]
+ [ r , Umn , r ; load_8 , * ] ldp\t%w0, %w2, %y1
+ [ w , Umn , w ; neon_load1_2reg, fp ] ldp\t%s0, %s2, %y1
}
)
@@ -1604,6 +1604,9 @@ (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI")
(SI "V2SI") (SF "V2SF")
(DI "V2DI") (DF "V2DF")])
+;; Load/store pair mode.
+(define_mode_attr VPAIR [(SI "V2x4QI") (DI "V2x8QI")])
+
;; Register suffix for double-length mode.
(define_mode_attr Vdtype [(V4HF "8h") (V2SF "4s")])
@@ -300,10 +300,12 @@ (define_special_predicate "aarch64_mem_pair_operator"
(match_test "known_eq (GET_MODE_SIZE (mode),
GET_MODE_SIZE (GET_MODE (op)))"))))
-(define_predicate "aarch64_mem_pair_operand"
- (and (match_code "mem")
- (match_test "aarch64_legitimate_address_p (mode, XEXP (op, 0), false,
- ADDR_QUERY_LDP_STP)")))
+;; Like aarch64_mem_pair_operator, but additionally check the
+;; address is suitable.
+(define_special_predicate "aarch64_mem_pair_operand"
+ (and (match_operand 0 "aarch64_mem_pair_operator")
+ (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
+ false, ADDR_QUERY_LDP_STP)")))
(define_predicate "pmode_plus_operator"
(and (match_code "plus")