@@ -22,6 +22,7 @@ INSERT_PASS_BEFORE (pass_sched, 1, pass_aarch64_early_ra);
INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_pstate_sm);
+INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_late_track_speculation);
INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
@@ -1075,6 +1075,7 @@ std::string aarch64_get_extension_string_for_isa_flags (aarch64_feature_flags,
rtl_opt_pass *make_pass_aarch64_early_ra (gcc::context *);
rtl_opt_pass *make_pass_fma_steering (gcc::context *);
rtl_opt_pass *make_pass_track_speculation (gcc::context *);
+rtl_opt_pass *make_pass_late_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
@@ -105,6 +105,7 @@ (define_insn "aarch64_get_sme_state"
(clobber (reg:CC CC_REGNUM))]
""
"bl\t__arm_sme_state"
+ [(set_attr "is_call" "yes")]
)
(define_insn "aarch64_read_svcr"
@@ -242,6 +243,7 @@ (define_insn "aarch64_tpidr2_save"
(clobber (reg:CC CC_REGNUM))]
""
"bl\t__arm_tpidr2_save"
+ [(set_attr "is_call" "yes")]
)
;; Set PSTATE.ZA to 1. If ZA was previously dormant or active,
@@ -358,6 +360,7 @@ (define_insn "aarch64_tpidr2_restore"
(clobber (reg:CC CC_REGNUM))]
""
"bl\t__arm_tpidr2_restore"
+ [(set_attr "is_call" "yes")]
)
;; Check whether a lazy save set up by aarch64_save_za was committed
@@ -39,10 +39,10 @@
#include "insn-config.h"
#include "recog.h"
-/* This pass scans the RTL just before the final branch
- re-organisation pass. The aim is to identify all places where
- there is conditional control flow and to insert code that tracks
- any speculative execution of a conditional branch.
+/* This pass scans the RTL insns late in the RTL pipeline. The aim is
+ to identify all places where there is conditional control flow and
+ to insert code that tracks any speculative execution of a conditional
+ branch.
To do this we reserve a call-clobbered register (so that it can be
initialized very early in the function prologue) that can then be
@@ -131,11 +131,22 @@
carry the tracking state in SP for this period of time unless the
tracker value is needed at that point in time.
- We run the pass just before the final branch reorganization pass so
- that we can handle most of the conditional branch cases using the
- standard edge insertion code. The reorg pass will hopefully clean
- things up for afterwards so that the results aren't too
- horrible. */
+ We run the pass while the CFG is still present so that we can handle
+ most of the conditional branch cases using the standard edge insertion
+ code. Where possible, we prefer to run the pass just before the final
+ branch reorganization pass. That pass will then hopefully clean things
+ up afterwards so that the results aren't too horrible.
+
+ However, we must run the pass after all conditional branches have
+ been inserted. switch_pstate_sm inserts conditional branches for
+ streaming-compatible code, and so for streaming-compatible functions,
+ this pass must run after that one.
+
+ We handle this by having two copies of the pass: the normal one that
+ runs before branch reorganization, and a "late" one that runs just
+ before late_thread_prologue_and_epilogue. The two passes have
+ mutually exclusive gates, with the normal pass being chosen wherever
+ possible. */
/* Generate a code sequence to clobber SP if speculating incorrectly. */
static rtx_insn *
@@ -315,11 +326,15 @@ aarch64_do_track_speculation ()
needs_tracking = true;
}
- if (CALL_P (insn))
+ if (CALL_P (insn)
+ || (NONDEBUG_INSN_P (insn)
+ && recog_memoized (insn) >= 0
+ && get_attr_is_call (insn) == IS_CALL_YES))
{
bool tailcall
- = (SIBLING_CALL_P (insn)
- || find_reg_note (insn, REG_NORETURN, NULL_RTX));
+ = (CALL_P (insn)
+ && (SIBLING_CALL_P (insn)
+ || find_reg_note (insn, REG_NORETURN, NULL_RTX)));
/* Tailcalls are like returns, we can eliminate the
transfer between the tracker register and SP if we
@@ -461,21 +476,28 @@ const pass_data pass_data_aarch64_track_speculation =
class pass_track_speculation : public rtl_opt_pass
{
- public:
- pass_track_speculation(gcc::context *ctxt)
- : rtl_opt_pass(pass_data_aarch64_track_speculation, ctxt)
- {}
+public:
+ pass_track_speculation(gcc::context *ctxt, bool is_late)
+ : rtl_opt_pass(pass_data_aarch64_track_speculation, ctxt),
+ is_late (is_late)
+ {}
/* opt_pass methods: */
virtual bool gate (function *)
{
- return aarch64_track_speculation;
+ return (aarch64_track_speculation
+ && (is_late == bool (TARGET_STREAMING_COMPATIBLE)));
}
virtual unsigned int execute (function *)
{
return aarch64_do_track_speculation ();
}
+
+private:
+ /* Whether this is the late pass that runs before late prologue/epilogue
+ insertion, or the normal pass that runs before branch reorganization. */
+ bool is_late;
}; // class pass_track_speculation.
} // anon namespace.
@@ -483,5 +505,11 @@ class pass_track_speculation : public rtl_opt_pass
rtl_opt_pass *
make_pass_track_speculation (gcc::context *ctxt)
{
- return new pass_track_speculation (ctxt);
+ return new pass_track_speculation (ctxt, /*is_late=*/false);
+}
+
+rtl_opt_pass *
+make_pass_late_track_speculation (gcc::context *ctxt)
+{
+ return new pass_track_speculation (ctxt, /*is_late=*/true);
}
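
The gate arrangement described in the comment above means that, for any given function, at most one of the two pass instances runs. As an illustration only (the helper names below are not part of the patch), the selection reduces to:

/* Illustration: which instance of the speculation-tracking pass runs for
   a function, given the -mtrack-speculation setting and whether the
   function is streaming-compatible.  */
bool
normal_pass_runs_p (bool track_speculation, bool streaming_compatible)
{
  /* The normal instance has is_late == false.  */
  return track_speculation && !streaming_compatible;
}

bool
late_pass_runs_p (bool track_speculation, bool streaming_compatible)
{
  /* The late instance has is_late == true; it covers the functions in
     which switch_pstate_sm may insert extra conditional branches.  */
  return track_speculation && streaming_compatible;
}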
@@ -2659,6 +2659,27 @@ aarch64_gen_compare_zero_and_branch (rtx_code code, rtx x,
return gen_rtx_SET (pc_rtx, x);
}
+/* Return an rtx that branches to LABEL based on the value of bit BITNUM of X.
+ If CODE is NE, it branches to LABEL when the bit is set; if CODE is EQ,
+ it branches to LABEL when the bit is clear. */
+
+static rtx
+aarch64_gen_test_and_branch (rtx_code code, rtx x, int bitnum,
+ rtx_code_label *label)
+{
+ auto mode = GET_MODE (x);
+ if (aarch64_track_speculation)
+ {
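+ /* Use an explicit flag-setting test rather than TB(N)Z, so that the
+ branch condition is available in the condition codes for the
+ speculation-tracking sequences to key off.  */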
+ auto mask = gen_int_mode (HOST_WIDE_INT_1U << bitnum, mode);
+ emit_insn (gen_aarch64_and3nr_compare0 (mode, x, mask));
+ rtx cc_reg = gen_rtx_REG (CC_NZVmode, CC_REGNUM);
+ rtx x = gen_rtx_fmt_ee (code, CC_NZVmode, cc_reg, const0_rtx);
+ return gen_condjump (x, cc_reg, label);
+ }
+ return gen_aarch64_tb (code, mode, mode,
+ x, gen_int_mode (bitnum, mode), label);
+}
+
/* Consider the operation:
OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
@@ -4881,8 +4902,9 @@ aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
gcc_assert (local_mode != 0);
auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
auto *label = gen_label_rtx ();
- auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode, DImode,
- old_svcr, const0_rtx, label));
+ auto branch = aarch64_gen_test_and_branch (already_ok_cond, old_svcr, 0,
+ label);
+ auto *jump = emit_jump_insn (branch);
JUMP_LABEL (jump) = label;
return label;
}
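
For reference, a minimal usage sketch of the new helper (only aarch64_gen_test_and_branch is from the patch; the surrounding names are local to the example, and reg is assumed to be a DImode register rtx holding the value to test). It branches to a label when bit 0 of reg is set, letting the helper pick between TBNZ and the ANDS-plus-conditional-branch form according to -mtrack-speculation, as the call in aarch64_guard_switch_pstate_sm above does:

rtx_code_label *label = gen_label_rtx ();
rtx branch = aarch64_gen_test_and_branch (NE, reg, 0, label);
rtx_insn *jump = emit_jump_insn (branch);
JUMP_LABEL (jump) = label;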
@@ -439,6 +439,12 @@ (define_enum "arches" [any rcpc8_4 fp fp_q base_simd nobase_simd
(define_enum_attr "arch" "arches" (const_string "any"))
+;; Whether a normal INSN in fact contains a call. Sometimes we represent
+;; calls to functions that use an ad-hoc ABI as normal insns, both for
+;; optimization reasons and to avoid the need to describe the ABI to
+;; target-independent code.
+(define_attr "is_call" "no,yes" (const_string "no"))
+
;; [For compatibility with Arm in pipeline models]
;; Attribute that specifies whether or not the instruction touches fp
;; registers.
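
The point of the attribute is that RTL passes can recognise such pseudo-call insns even though they are not CALL_P. A minimal sketch of the query, distilled from the aarch64_do_track_speculation change above (the helper name is purely illustrative):

/* Return true if INSN transfers control to another function, either as a
   real call or as a normal insn marked with is_call "yes".  */
static bool
aarch64_insn_is_call_like (rtx_insn *insn)
{
  return (CALL_P (insn)
	  || (NONDEBUG_INSN_P (insn)
	      && recog_memoized (insn) >= 0
	      && get_attr_is_call (insn) == IS_CALL_YES));
}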
@@ -5395,7 +5401,7 @@ (define_insn "*ands<GPI:mode>_compare0"
[(set_attr "type" "alus_imm")]
)
-(define_insn "*and<mode>3nr_compare0"
+(define_insn "@aarch64_and<mode>3nr_compare0"
[(set (reg:CC_NZV CC_REGNUM)
(compare:CC_NZV
(and:GPI (match_operand:GPI 0 "register_operand")
new file mode 100644
@@ -0,0 +1,209 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -funwind-tables -mtrack-speculation" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void ns_callee ();
+ void s_callee () [[arm::streaming]];
+ void sc_callee () [[arm::streaming_compatible]];
+
+void ns_callee_stack (int, int, int, int, int, int, int, int, int);
+
+struct callbacks {
+ void (*ns_ptr) ();
+ void (*s_ptr) () [[arm::streaming]];
+ void (*sc_ptr) () [[arm::streaming_compatible]];
+};
+
+/*
+** sc_caller_sme:
+** cmp sp, #?0
+** csetm x15, ne
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** cntd x16
+** str x16, \[sp, #?24\]
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** ldr x16, \[x29, #?16\]
+** tst x16, #?1
+** beq [^\n]*
+** csel x15, x15, xzr, ne
+** smstop sm
+** b [^\n]*
+** csel x15, x15, xzr, eq
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl ns_callee
+** cmp sp, #?0
+** csetm x15, ne
+** ldr x16, \[x29, #?16\]
+** tst x16, #?1
+** beq [^\n]*
+** csel x15, x15, xzr, ne
+** smstart sm
+** b [^\n]*
+** csel x15, x15, xzr, eq
+** ldr x16, \[x29, #?16\]
+** tst x16, #?1
+** bne [^\n]*
+** csel x15, x15, xzr, eq
+** smstart sm
+** b [^\n]*
+** csel x15, x15, xzr, ne
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl s_callee
+** cmp sp, #?0
+** csetm x15, ne
+** ldr x16, \[x29, #?16\]
+** tst x16, #?1
+** bne [^\n]*
+** csel x15, x15, xzr, eq
+** smstop sm
+** b [^\n]*
+** csel x15, x15, xzr, ne
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl sc_callee
+** cmp sp, #?0
+** csetm x15, ne
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** ret
+*/
+void
+sc_caller_sme () [[arm::streaming_compatible]]
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+}
+
+#pragma GCC target "+nosme"
+
+/*
+** sc_caller:
+** cmp sp, #?0
+** csetm x15, ne
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** cntd x16
+** str x16, \[sp, #?24\]
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl __arm_sme_state
+** cmp sp, #?0
+** csetm x15, ne
+** str x0, \[x29, #?16\]
+** ...
+** bl sc_callee
+** cmp sp, #?0
+** csetm x15, ne
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** ret
+*/
+void
+sc_caller () [[arm::streaming_compatible]]
+{
+ ns_callee ();
+ sc_callee ();
+}
+
+/*
+** sc_caller_x0:
+** ...
+** mov x10, x0
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl __arm_sme_state
+** ...
+** str wzr, \[x10\]
+** ...
+*/
+void
+sc_caller_x0 (int *ptr) [[arm::streaming_compatible]]
+{
+ *ptr = 0;
+ ns_callee ();
+ sc_callee ();
+}
+
+/*
+** sc_caller_x1:
+** ...
+** mov x10, x0
+** mov x11, x1
+** mov x14, sp
+** and x14, x14, x15
+** mov sp, x14
+** bl __arm_sme_state
+** ...
+** str w11, \[x10\]
+** ...
+*/
+void
+sc_caller_x1 (int *ptr, int a) [[arm::streaming_compatible]]
+{
+ *ptr = a;
+ ns_callee ();
+ sc_callee ();
+}
+
+/*
+** sc_caller_stack:
+** cmp sp, #?0
+** csetm x15, ne
+** sub sp, sp, #112
+** stp x29, x30, \[sp, #?16\]
+** add x29, sp, #?16
+** ...
+** stp d8, d9, \[sp, #?48\]
+** ...
+** bl __arm_sme_state
+** cmp sp, #?0
+** csetm x15, ne
+** str x0, \[x29, #?16\]
+** ...
+** bl ns_callee_stack
+** cmp sp, #?0
+** csetm x15, ne
+** ldr x16, \[x29, #?16\]
+** tst x16, #?1
+** beq [^\n]*
+** csel x15, x15, xzr, ne
+** smstart sm
+** ...
+*/
+void
+sc_caller_stack () [[arm::streaming_compatible]]
+{
+ ns_callee_stack (0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/* { dg-final { scan-assembler {sc_caller_sme:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */
+/* { dg-final { scan-assembler {sc_caller:(?:(?!ret).)*\.cfi_offset 46, -72\n} } } */