@@ -306,9 +306,18 @@ private:
// equivalent to EQUIV_ALLOCNO for the whole of this allocno's lifetime.
unsigned int equiv_allocno;
- // The next chained allocno in program order (i.e. at lower program
- // points), or INVALID_ALLOCNO if none.
- unsigned int chain_next;
+ union
+ {
+ // The program point at which the allocno was last defined,
+ // or START_OF_REGION if none. This is only used temporarily
+ // while recording allocnos; after that, chain_next below is
+ // used instead.
+ unsigned int last_def_point;
+
+ // The next chained allocno in program order (i.e. at lower program
+ // points), or INVALID_ALLOCNO if none.
+ unsigned int chain_next;
+ };
// The previous chained allocno in program order (i.e. at higher
// program points), or INVALID_ALLOCNO if none.
@@ -406,6 +415,7 @@ private:
void record_fpr_def (unsigned int);
void record_allocno_use (allocno_info *);
void record_allocno_def (allocno_info *);
+ bool valid_equivalence_p (allocno_info *, allocno_info *);
void record_copy (rtx, rtx, bool = false);
void record_constraints (rtx_insn *);
void record_artificial_refs (unsigned int);
@@ -479,6 +489,9 @@ private:
// The basic block that we're currently processing.
basic_block m_current_bb;
+ // The lowest-numbered program point in the current basic block.
+ unsigned int m_current_bb_point;
+
// The program point that we're currently processing (described above).
unsigned int m_current_point;
@@ -576,21 +589,26 @@ likely_operand_match_p (const operand_alternative &op_alt, rtx op)
return true;
auto cn = lookup_constraint (constraint);
- if (REG_P (op) || SUBREG_P (op))
+ switch (get_constraint_type (cn))
{
- if (insn_extra_register_constraint (cn))
+ case CT_REGISTER:
+ if (REG_P (op) || SUBREG_P (op))
return true;
- }
- else if (MEM_P (op))
- {
- if (insn_extra_memory_constraint (cn))
+ break;
+
+ case CT_MEMORY:
+ case CT_SPECIAL_MEMORY:
+ case CT_RELAXED_MEMORY:
+ if (MEM_P (op))
return true;
- }
- else
- {
- if (!insn_extra_memory_constraint (cn)
- && constraint_satisfied_p (op, cn))
+ break;
+
+ case CT_CONST_INT:
+ case CT_ADDRESS:
+ case CT_FIXED_FORM:
+ if (constraint_satisfied_p (op, cn))
return true;
+ break;
}
constraint += len;
@@ -1407,10 +1425,14 @@ early_ra::record_allocno_use (allocno_info *allocno)
{
bitmap_set_bit (m_live_allocnos, allocno->id);
if (allocno->end_point > m_current_point)
- allocno->end_point = m_current_point;
+ {
+ allocno->end_point = m_current_point;
+ allocno->last_def_point = START_OF_REGION;
+ }
allocno->start_point = m_current_point;
allocno->is_copy_dest = false;
allocno->is_strong_copy_dest = false;
+ allocno->equiv_allocno = INVALID_ALLOCNO;
}
// Record a definition of the allocno with index AI at the current program
@@ -1419,6 +1441,7 @@ early_ra::record_allocno_use (allocno_info *allocno)
void
early_ra::record_allocno_def (allocno_info *allocno)
{
+ allocno->last_def_point = m_current_point;
allocno->start_point = m_current_point;
allocno->num_defs = MIN (allocno->num_defs + 1, 2);
gcc_checking_assert (!allocno->is_copy_dest
@@ -1427,6 +1450,30 @@ early_ra::record_allocno_def (allocno_info *allocno)
gcc_unreachable ();
}
+// Return true if a move from SRC_ALLOCNO to DEST_ALLOCNO could be treated
+// as an equivalence. Note the argument order: the destination comes first.
+bool
+early_ra::valid_equivalence_p (allocno_info *dest_allocno,
+ allocno_info *src_allocno)
+{
+ if (src_allocno->end_point > dest_allocno->end_point)
+ // The src allocno dies first (higher program points are earlier).
+ return false;
+
+ if (src_allocno->num_defs != 0)
+ {
+ if (dest_allocno->end_point < m_current_bb_point)
+ // The destination's range ends outside the current block. We don't track
+ // enough information to handle multiple definitions across block boundaries.
+ return false;
+
+ if (src_allocno->last_def_point >= dest_allocno->end_point)
+ // The source is defined again during the destination's live range.
+ return false;
+ }
+ return dest_allocno->num_defs == 1;
+}
+
// Record any relevant allocno-related information for an actual or imagined
// copy from SRC to DEST. FROM_MOVE_P is true if the copy was an explicit
// move instruction, false if it represents one way of satisfying the previous
@@ -1512,9 +1559,7 @@ early_ra::record_copy (rtx dest, rtx src, bool from_move_p)
dest_allocno->is_copy_dest = 1;
}
else if (from_move_p
- && src_allocno->end_point <= dest_allocno->end_point
- && src_allocno->num_defs == 0
- && dest_allocno->num_defs == 1)
+ && valid_equivalence_p (dest_allocno, src_allocno))
dest_allocno->equiv_allocno = src_allocno->id;
}
}
@@ -3048,6 +3093,9 @@ early_ra::apply_allocation ()
void
early_ra::process_region ()
{
+ for (auto *allocno : m_allocnos)
+ allocno->chain_next = INVALID_ALLOCNO;
+
if (dump_file && (dump_flags & TDF_DETAILS))
{
dump_fpr_ranges ();
@@ -3117,6 +3165,8 @@ void
early_ra::process_block (basic_block bb, bool is_isolated)
{
m_current_bb = bb;
+ m_current_point += 1;
+ m_current_bb_point = m_current_point;
// Process live-out FPRs.
bitmap live_out = df_get_live_out (bb);
@@ -3414,8 +3464,7 @@ pass_early_ra::execute (function *fn)
} // end namespace
-// Create a new CC fusion pass instance.
-
+// Create a new instance of the pass.
rtl_opt_pass *
make_pass_aarch64_early_ra (gcc::context *ctxt)
{
new file mode 100644
@@ -0,0 +1,115 @@
+// { dg-options "-O2 -fno-schedule-insns -fno-schedule-insns2" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sme.h>
+
+#pragma GCC target "+sme2"
+
+// This file deliberately contains nonsense code.
+
+/*
+** test1:
+** ptrue (pn[0-9]+)\.s
+** ld1w {z16\.s - z19\.s}, \1/z, \[x1\]
+** ld1w {z20\.s - z23\.s}, \1/z, \[x1, #4, mul vl\]
+** ld1w {z24\.s - z27\.s}, \1/z, \[x1, #8, mul vl\]
+** ld1w {z28\.s - z31\.s}, \1/z, \[x1, #12, mul vl\]
+** ptrue [^\n]+
+** ld1rqw [^\n]+
+** ld1rqw [^\n]+
+** sclamp {z16\.s - z19\.s}, [^\n]+
+** sclamp {z20\.s - z23\.s}, [^\n]+
+** sclamp {z24\.s - z27\.s}, [^\n]+
+** sclamp {z28\.s - z31\.s}, [^\n]+
+** st1w {z16\.s, z20\.s, z24\.s, z28\.s}, \1, \[x0\]
+** st1w {z17\.s, z21\.s, z25\.s, z29\.s}, \1, \[x0, #4, mul vl\]
+** st1w {z18\.s, z22\.s, z26\.s, z30\.s}, \1, \[x0, #8, mul vl\]
+** st1w {z19\.s, z23\.s, z27\.s, z31\.s}, \1, \[x0, #12, mul vl\]
+** st1w {z16\.s, z20\.s, z24\.s, z28\.s}, \1, \[x0, #16, mul vl\]
+** st1w {z17\.s, z21\.s, z25\.s, z29\.s}, \1, \[x0, #20, mul vl\]
+** st1w {z18\.s, z22\.s, z26\.s, z30\.s}, \1, \[x0, #24, mul vl\]
+** st1w {z19\.s, z23\.s, z27\.s, z31\.s}, \1, \[x0, #28, mul vl\]
+** ld1w {z16\.s - z19\.s}, \1/z, \[x3\]
+** ld1w {z20\.s - z23\.s}, \1/z, \[x3, #4, mul vl\]
+** ld1w {z24\.s - z27\.s}, \1/z, \[x3, #8, mul vl\]
+** ld1w {z28\.s - z31\.s}, \1/z, \[x3, #12, mul vl\]
+** sclamp {z16\.s - z19\.s}, [^\n]+
+** sclamp {z20\.s - z23\.s}, [^\n]+
+** sclamp {z24\.s - z27\.s}, [^\n]+
+** sclamp {z28\.s - z31\.s}, [^\n]+
+** ...
+** ret
+*/
+void test1(int32_t *dest, int32_t *src1, int32_t *src2,
+ int32_t *src3) __arm_streaming
+{ // The expected body above pins z16-z31; the final directive forbids vector moves.
+ svcount_t pg = svptrue_c32();
+ svint32x4_t l0 = svld1_vnum_x4(pg, src1, 0); // l0-l3: four x4 tuples loaded from src1.
+ svint32x4_t l1 = svld1_vnum_x4(pg, src1, 4);
+ svint32x4_t l2 = svld1_vnum_x4(pg, src1, 8);
+ svint32x4_t l3 = svld1_vnum_x4(pg, src1, 12);
+ svint32_t l4 = svld1rq(svptrue_b32(), src2); // l4, l5: passed to every svclamp below.
+ svint32_t l5 = svld1rq(svptrue_b32(), src2 + 4);
+ l0 = svclamp(l0, l4, l5);
+ l1 = svclamp(l1, l4, l5);
+ l2 = svclamp(l2, l4, l5);
+ l3 = svclamp(l3, l4, l5);
+ svst1_vnum(pg, dest, 0, // Store element 0 of each of l0-l3 as one tuple.
+ svcreate4(svget4(l0, 0), svget4(l1, 0),
+ svget4(l2, 0), svget4(l3, 0)));
+ svst1_vnum(pg, dest, 4,
+ svcreate4(svget4(l0, 1), svget4(l1, 1),
+ svget4(l2, 1), svget4(l3, 1)));
+ svst1_vnum(pg, dest, 8,
+ svcreate4(svget4(l0, 2), svget4(l1, 2),
+ svget4(l2, 2), svget4(l3, 2)));
+ svst1_vnum(pg, dest, 12,
+ svcreate4(svget4(l0, 3), svget4(l1, 3),
+ svget4(l2, 3), svget4(l3, 3)));
+ svst1_vnum(pg, dest, 16, // Offsets 16-28 store the same tuples as offsets 0-12.
+ svcreate4(svget4(l0, 0), svget4(l1, 0),
+ svget4(l2, 0), svget4(l3, 0)));
+ svst1_vnum(pg, dest, 20,
+ svcreate4(svget4(l0, 1), svget4(l1, 1),
+ svget4(l2, 1), svget4(l3, 1)));
+ svst1_vnum(pg, dest, 24,
+ svcreate4(svget4(l0, 2), svget4(l1, 2),
+ svget4(l2, 2), svget4(l3, 2)));
+ svst1_vnum(pg, dest, 28,
+ svcreate4(svget4(l0, 3), svget4(l1, 3),
+ svget4(l2, 3), svget4(l3, 3)));
+ l0 = svld1_vnum_x4(pg, src3, 0); // Redefine l0-l3 from src3, then repeat the clamps and stores.
+ l1 = svld1_vnum_x4(pg, src3, 4);
+ l2 = svld1_vnum_x4(pg, src3, 8);
+ l3 = svld1_vnum_x4(pg, src3, 12);
+ l0 = svclamp(l0, l4, l5);
+ l1 = svclamp(l1, l4, l5);
+ l2 = svclamp(l2, l4, l5);
+ l3 = svclamp(l3, l4, l5);
+ svst1_vnum(pg, dest, 32,
+ svcreate4(svget4(l0, 0), svget4(l1, 0),
+ svget4(l2, 0), svget4(l3, 0)));
+ svst1_vnum(pg, dest, 36,
+ svcreate4(svget4(l0, 1), svget4(l1, 1),
+ svget4(l2, 1), svget4(l3, 1)));
+ svst1_vnum(pg, dest, 40,
+ svcreate4(svget4(l0, 2), svget4(l1, 2),
+ svget4(l2, 2), svget4(l3, 2)));
+ svst1_vnum(pg, dest, 44,
+ svcreate4(svget4(l0, 3), svget4(l1, 3),
+ svget4(l2, 3), svget4(l3, 3)));
+ svst1_vnum(pg, dest, 48, // Offsets 48-60 store the same tuples as offsets 32-44.
+ svcreate4(svget4(l0, 0), svget4(l1, 0),
+ svget4(l2, 0), svget4(l3, 0)));
+ svst1_vnum(pg, dest, 52,
+ svcreate4(svget4(l0, 1), svget4(l1, 1),
+ svget4(l2, 1), svget4(l3, 1)));
+ svst1_vnum(pg, dest, 56,
+ svcreate4(svget4(l0, 2), svget4(l1, 2),
+ svget4(l2, 2), svget4(l3, 2)));
+ svst1_vnum(pg, dest, 60,
+ svcreate4(svget4(l0, 3), svget4(l1, 3),
+ svget4(l2, 3), svget4(l3, 3)));
+}
+
+/* { dg-final { scan-assembler-not {\tmov\tz} } } */