@@ -20,6 +20,7 @@
INSERT_PASS_AFTER (pass_regrename, 1, pass_fma_steering);
INSERT_PASS_BEFORE (pass_reorder_blocks, 1, pass_track_speculation);
+INSERT_PASS_BEFORE (pass_late_thread_prologue_and_epilogue, 1, pass_switch_sm_state);
INSERT_PASS_AFTER (pass_machine_reorg, 1, pass_tag_collision_avoidance);
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_bti);
INSERT_PASS_AFTER (pass_if_after_combine, 1, pass_cc_fusion);
@@ -910,7 +910,7 @@ void aarch64_sve_expand_vector_init (rtx, rtx);
void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
const_tree, unsigned, bool = false);
void aarch64_init_expanders (void);
-void aarch64_emit_call_insn (rtx);
+rtx_insn *aarch64_emit_call_insn (rtx);
void aarch64_register_pragmas (void);
void aarch64_relayout_simd_types (void);
void aarch64_reset_previous_fndecl (void);
@@ -1051,6 +1051,7 @@ rtl_opt_pass *make_pass_track_speculation (gcc::context *);
rtl_opt_pass *make_pass_tag_collision_avoidance (gcc::context *);
rtl_opt_pass *make_pass_insert_bti (gcc::context *ctxt);
rtl_opt_pass *make_pass_cc_fusion (gcc::context *ctxt);
+rtl_opt_pass *make_pass_switch_sm_state (gcc::context *ctxt);
poly_uint64 aarch64_regmode_natural_size (machine_mode);
new file mode 100644
@@ -0,0 +1,133 @@
+;; Machine description for AArch64 SME.
+;; Copyright (C) 2022 Free Software Foundation, Inc.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3. If not see
+;; <http://www.gnu.org/licenses/>.
+
+;; The file is organised into the following sections (search for the full
+;; line):
+;;
+;; == State management
+;; ---- Test current state
+;; ---- PSTATE.SM management
+
+;; =========================================================================
+;; == State management
+;; =========================================================================
+
+;; -------------------------------------------------------------------------
+;; ---- Test current state
+;; -------------------------------------------------------------------------
+
+(define_c_enum "unspec" [
+ UNSPEC_GET_SME_STATE
+ UNSPEC_READ_SVCR
+])
+
+(define_insn "aarch64_get_sme_state"
+ [(set (reg:TI R0_REGNUM)
+ (unspec_volatile:TI [(const_int 0)] UNSPEC_GET_SME_STATE))
+ (clobber (reg:DI R16_REGNUM))
+ (clobber (reg:DI R17_REGNUM))
+ (clobber (reg:DI R18_REGNUM))
+ (clobber (reg:DI R30_REGNUM))
+ (clobber (reg:CC CC_REGNUM))]
+ ""
+ "bl\t__arm_sme_state"
+)
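+;; The result is returned in x0 and x1, hence the TImode destination.
+;; The prologue code in aarch64.cc only relies on bit 0 of x0, which is
+;; expected to match the SM bit of SVCR.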
+
+(define_insn "aarch64_read_svcr"
+ [(set (match_operand:DI 0 "register_operand" "=r")
+ (unspec_volatile:DI [(const_int 0)] UNSPEC_READ_SVCR))]
+ "TARGET_SME"
+ "mrs\t%0, svcr"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- PSTATE.SM management
+;; -------------------------------------------------------------------------
+;; Includes
+;; - SMSTART SM
+;; - SMSTOP SM
+;; -------------------------------------------------------------------------
+
+(define_c_enum "unspec" [
+ UNSPEC_SMSTART_SM
+ UNSPEC_SMSTOP_SM
+])
+
+;; This pattern deliberately has no TARGET_* condition, since (a) it is
+;; only ever emitted under direct control of aarch64 code and (b) it is
+;; sometimes emitted conditionally, in code that is only executed when
+;; SME is available at run time.
+(define_insn "aarch64_smstart_sm"
+ [(unspec_volatile [(const_int 0)] UNSPEC_SMSTART_SM)
+ (clobber (reg:V4x16QI V0_REGNUM))
+ (clobber (reg:V4x16QI V4_REGNUM))
+ (clobber (reg:V4x16QI V8_REGNUM))
+ (clobber (reg:V4x16QI V12_REGNUM))
+ (clobber (reg:V4x16QI V16_REGNUM))
+ (clobber (reg:V4x16QI V20_REGNUM))
+ (clobber (reg:V4x16QI V24_REGNUM))
+ (clobber (reg:V4x16QI V28_REGNUM))
+ (clobber (reg:VNx16BI P0_REGNUM))
+ (clobber (reg:VNx16BI P1_REGNUM))
+ (clobber (reg:VNx16BI P2_REGNUM))
+ (clobber (reg:VNx16BI P3_REGNUM))
+ (clobber (reg:VNx16BI P4_REGNUM))
+ (clobber (reg:VNx16BI P5_REGNUM))
+ (clobber (reg:VNx16BI P6_REGNUM))
+ (clobber (reg:VNx16BI P7_REGNUM))
+ (clobber (reg:VNx16BI P8_REGNUM))
+ (clobber (reg:VNx16BI P9_REGNUM))
+ (clobber (reg:VNx16BI P10_REGNUM))
+ (clobber (reg:VNx16BI P11_REGNUM))
+ (clobber (reg:VNx16BI P12_REGNUM))
+ (clobber (reg:VNx16BI P13_REGNUM))
+ (clobber (reg:VNx16BI P14_REGNUM))
+ (clobber (reg:VNx16BI P15_REGNUM))]
+ ""
+ "smstart\tsm"
+)
+
+(define_insn "aarch64_smstop_sm"
+ [(unspec_volatile [(const_int 0)] UNSPEC_SMSTOP_SM)
+ (clobber (reg:V4x16QI V0_REGNUM))
+ (clobber (reg:V4x16QI V4_REGNUM))
+ (clobber (reg:V4x16QI V8_REGNUM))
+ (clobber (reg:V4x16QI V12_REGNUM))
+ (clobber (reg:V4x16QI V16_REGNUM))
+ (clobber (reg:V4x16QI V20_REGNUM))
+ (clobber (reg:V4x16QI V24_REGNUM))
+ (clobber (reg:V4x16QI V28_REGNUM))
+ (clobber (reg:VNx16BI P0_REGNUM))
+ (clobber (reg:VNx16BI P1_REGNUM))
+ (clobber (reg:VNx16BI P2_REGNUM))
+ (clobber (reg:VNx16BI P3_REGNUM))
+ (clobber (reg:VNx16BI P4_REGNUM))
+ (clobber (reg:VNx16BI P5_REGNUM))
+ (clobber (reg:VNx16BI P6_REGNUM))
+ (clobber (reg:VNx16BI P7_REGNUM))
+ (clobber (reg:VNx16BI P8_REGNUM))
+ (clobber (reg:VNx16BI P9_REGNUM))
+ (clobber (reg:VNx16BI P10_REGNUM))
+ (clobber (reg:VNx16BI P11_REGNUM))
+ (clobber (reg:VNx16BI P12_REGNUM))
+ (clobber (reg:VNx16BI P13_REGNUM))
+ (clobber (reg:VNx16BI P14_REGNUM))
+ (clobber (reg:VNx16BI P15_REGNUM))]
+ ""
+ "smstop\tsm"
+)
@@ -82,6 +82,8 @@
#include "tree-dfa.h"
#include "asan.h"
#include "aarch64-feature-deps.h"
+#include "tree-pass.h"
+#include "cfgbuild.h"
/* This file should be included last. */
#include "target-def.h"
@@ -4103,6 +4105,26 @@ aarch64_fndecl_isa_mode (const_tree fndecl)
return aarch64_fndecl_sm_state (fndecl);
}
+/* Return the state of PSTATE.SM on entry to the current function.
+ This might be different from the state of PSTATE.SM in the function
+ body. */
+
+static aarch64_feature_flags
+aarch64_cfun_incoming_sm_state ()
+{
+ return aarch64_fntype_sm_state (TREE_TYPE (cfun->decl));
+}
+
+/* Return true if a call from the current function to a function with
+ ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
+ the BL instruction. */
+
+static bool
+aarch64_call_switches_sm_state (aarch64_feature_flags callee_mode)
+{
+ return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
+}
+
/* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
static bool
@@ -4185,6 +4207,16 @@ aarch64_callee_abi (rtx cookie)
return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
}
+/* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
+ required ISA mode on entry to the callee, which is also the ISA
+ mode on return from the callee. */
+
+static aarch64_feature_flags
+aarch64_callee_isa_mode (rtx cookie)
+{
+ return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
+}
+
/* INSN is a call instruction. Return the CONST_INT stored in its
UNSPEC_CALLEE_ABI rtx. */
@@ -4207,6 +4239,15 @@ aarch64_insn_callee_abi (const rtx_insn *insn)
return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
}
+/* INSN is a call instruction. Return the required ISA mode on entry to
+ the callee, which is also the ISA mode on return from the callee. */
+
+static aarch64_feature_flags
+aarch64_insn_callee_isa_mode (const rtx_insn *insn)
+{
+ return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
+}
+
/* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
the lower 64 bits of a 128-bit register. Tell the compiler the callee
clobbers the top 64 bits when restoring the bottom 64 bits. */
@@ -6394,6 +6435,428 @@ aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
temp1, temp2, frame_related_p, emit_move_imm);
}
+/* A streaming-compatible function needs to switch temporarily to the known
+ PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
+ the runtime state of PSTATE.SM in the streaming-compatible code, before
+ the start of the switch to LOCAL_MODE.
+
+ Emit instructions to branch around the mode switch if PSTATE.SM already
+ matches LOCAL_MODE. Return the label that the branch jumps to. */
+
+static rtx_insn *
+aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
+{
+ local_mode &= AARCH64_FL_SM_STATE;
+ gcc_assert (local_mode != 0);
+ auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
+ auto *label = gen_label_rtx ();
+ auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode,
+ old_svcr, const0_rtx, label));
+ JUMP_LABEL (jump) = label;
+ return label;
+}
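+
+/* For example, when LOCAL_MODE requires streaming mode, the guard plus a
+   following aarch64_switch_pstate_sm give a sequence of roughly:
+
+	tbnz	x16, 0, 1f	// PSTATE.SM is already 1, so skip the switch
+	...save any live FP/predicate registers...
+	smstart	sm
+	...restore them...
+     1:
+
+   where the caller is responsible for emitting the returned label after
+   the switch.  */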
+
+/* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
+ state in NEW_MODE. This is known to involve either an SMSTART SM or
+ an SMSTOP SM. */
+
+static void
+aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
+ aarch64_feature_flags new_mode)
+{
+ old_mode &= AARCH64_FL_SM_STATE;
+ new_mode &= AARCH64_FL_SM_STATE;
+ gcc_assert (old_mode != new_mode);
+
+ if ((new_mode & AARCH64_FL_SM_ON)
+ || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
+ emit_insn (gen_aarch64_smstart_sm ());
+ else
+ emit_insn (gen_aarch64_smstop_sm ());
+}
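+
+/* For example, a streaming-compatible function, whose own SM bits are zero,
+   passes (0, AARCH64_FL_SM_OFF) here before a call to a non-streaming
+   function, giving SMSTOP SM, and (AARCH64_FL_SM_OFF, 0) after the call,
+   giving SMSTART SM; at run time aarch64_guard_switch_pstate_sm skips both
+   unless PSTATE.SM was actually on at the call.  */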
+
+/* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
+ FP and predicate registers. This class emits code to preserve any
+ necessary registers around the mode switch.
+
+ The class uses four approaches to saving and restoring contents, enumerated
+ by group_type:
+
+ - GPR: save and restore the contents of FP registers using GPRs.
+ This is used if the FP register contains no more than 64 significant
+ bits. The registers used are FIRST_GPR onwards.
+
+ - MEM_128: save and restore 128-bit SIMD registers using memory.
+
+ - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
+
+ - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
+
+ The save slots within each memory group are consecutive, with the
+ MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
+
+ There will only be two mode switches for each use of SME, so they should
+ not be particularly performance-sensitive. It's also rare for SIMD, SVE
+ or predicate registers to be live across mode switches. We therefore
+ don't preallocate the save slots but instead allocate them locally on
+ demand. This makes the code emitted by the class self-contained. */
+
+class aarch64_sme_mode_switch_regs
+{
+public:
+ static const unsigned int FIRST_GPR = R10_REGNUM;
+
+ void add_reg (machine_mode, unsigned int);
+ void add_call_args (rtx_call_insn *);
+ void add_call_result (rtx_call_insn *);
+
+ void emit_prologue ();
+ void emit_epilogue ();
+
+ /* The number of GPRs needed to save FP registers, starting from
+ FIRST_GPR. */
+ unsigned int num_gprs () { return m_group_count[GPR]; }
+
+private:
+ enum sequence { PROLOGUE, EPILOGUE };
+ enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
+
+ /* Information about the save location for one FP, SIMD, SVE data, or
+ SVE predicate register. */
+ struct save_location {
+ /* The register to be saved. */
+ rtx reg;
+
+ /* Which group the save location belongs to. */
+ group_type group;
+
+ /* A zero-based index of the register within the group. */
+ unsigned int index;
+ };
+
+ unsigned int sve_data_headroom ();
+ rtx get_slot_mem (machine_mode, poly_int64);
+ void emit_stack_adjust (sequence, poly_int64);
+ void emit_mem_move (sequence, const save_location &, poly_int64);
+
+ void emit_gpr_moves (sequence);
+ void emit_mem_128_moves (sequence);
+ void emit_sve_sp_adjust (sequence);
+ void emit_sve_pred_moves (sequence);
+ void emit_sve_data_moves (sequence);
+
+ /* All save locations, in no particular order. */
+ auto_vec<save_location, 12> m_save_locations;
+
+ /* The number of registers in each group. */
+ unsigned int m_group_count[NUM_GROUPS] = {};
+};
+
+/* Record that (reg:MODE REGNO) needs to be preserved around the mode
+ switch. */
+
+void
+aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
+{
+ if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
+ return;
+
+ unsigned int end_regno = end_hard_regno (mode, regno);
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
+ for (; regno < end_regno; regno++)
+ {
+ machine_mode submode = mode;
+ if (vec_flags & VEC_STRUCT)
+ {
+ if (vec_flags & VEC_SVE_DATA)
+ submode = SVE_BYTE_MODE;
+ else if (vec_flags & VEC_PARTIAL)
+ submode = V8QImode;
+ else
+ submode = V16QImode;
+ }
+ save_location loc;
+ loc.reg = gen_rtx_REG (submode, regno);
+ if (vec_flags == VEC_SVE_PRED)
+ {
+ gcc_assert (PR_REGNUM_P (regno));
+ loc.group = MEM_SVE_PRED;
+ }
+ else
+ {
+ gcc_assert (FP_REGNUM_P (regno));
+ if (known_le (GET_MODE_SIZE (submode), 8))
+ loc.group = GPR;
+ else if (known_eq (GET_MODE_SIZE (submode), 16))
+ loc.group = MEM_128;
+ else
+ loc.group = MEM_SVE_DATA;
+ }
+ loc.index = m_group_count[loc.group]++;
+ m_save_locations.quick_push (loc);
+ }
+}
+
+/* Record that the arguments to CALL_INSN need to be preserved around
+ the mode switch. */
+
+void
+aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
+{
+ for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
+ node; node = XEXP (node, 1))
+ {
+ rtx item = XEXP (node, 0);
+ if (GET_CODE (item) != USE)
+ continue;
+ item = XEXP (item, 0);
+ if (!REG_P (item))
+ continue;
+ add_reg (GET_MODE (item), REGNO (item));
+ }
+}
+
+/* Record that the return value from CALL_INSN (if any) needs to be
+ preserved around the mode switch. */
+
+void
+aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
+{
+ rtx pat = PATTERN (call_insn);
+ gcc_assert (GET_CODE (pat) == PARALLEL);
+ pat = XVECEXP (pat, 0, 0);
+ if (GET_CODE (pat) == CALL)
+ return;
+ rtx dest = SET_DEST (pat);
+ add_reg (GET_MODE (dest), REGNO (dest));
+}
+
+/* Emit code to save registers before the mode switch. */
+
+void
+aarch64_sme_mode_switch_regs::emit_prologue ()
+{
+ emit_sve_sp_adjust (PROLOGUE);
+ emit_sve_pred_moves (PROLOGUE);
+ emit_sve_data_moves (PROLOGUE);
+ emit_mem_128_moves (PROLOGUE);
+ emit_gpr_moves (PROLOGUE);
+}
+
+/* Emit code to restore registers after the mode switch. */
+
+void
+aarch64_sme_mode_switch_regs::emit_epilogue ()
+{
+ emit_gpr_moves (EPILOGUE);
+ emit_mem_128_moves (EPILOGUE);
+ emit_sve_pred_moves (EPILOGUE);
+ emit_sve_data_moves (EPILOGUE);
+ emit_sve_sp_adjust (EPILOGUE);
+}
+
+/* The SVE predicate registers are stored below the SVE data registers,
+ with the predicate save area being padded to a data-register-sized
+ boundary. Return the size of this padded area as a whole number
+ of data register slots. */
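+
+/* For example, if nine predicate registers and two SVE data registers
+   need to be saved, the headroom is CEIL (9, 8) == 2 data slots, so the
+   area occupies four vectors in total: predicate I is saved at offset
+   I * BYTES_PER_SVE_PRED and data register J at offset
+   (2 + J) * BYTES_PER_SVE_VECTOR.  */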
+
+unsigned int
+aarch64_sme_mode_switch_regs::sve_data_headroom ()
+{
+ return CEIL (m_group_count[MEM_SVE_PRED], 8);
+}
+
+/* Return a memory reference of mode MODE to OFFSET bytes from the
+ stack pointer. */
+
+rtx
+aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
+ poly_int64 offset)
+{
+ rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
+ return gen_rtx_MEM (mode, addr);
+}
+
+/* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
+
+void
+aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
+ poly_int64 size)
+{
+ if (seq == PROLOGUE)
+ size = -size;
+ emit_insn (gen_rtx_SET (stack_pointer_rtx,
+ plus_constant (Pmode, stack_pointer_rtx, size)));
+}
+
+/* Save or restore the register in LOC, whose slot is OFFSET bytes from
+ the stack pointer. SEQ chooses between saving and restoring. */
+
+void
+aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
+ const save_location &loc,
+ poly_int64 offset)
+{
+ rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
+ if (seq == PROLOGUE)
+ emit_move_insn (mem, loc.reg);
+ else
+ emit_move_insn (loc.reg, mem);
+}
+
+/* Emit instructions to save or restore the GPR group. SEQ chooses between
+ saving and restoring. */
+
+void
+aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
+{
+ for (auto &loc : m_save_locations)
+ if (loc.group == GPR)
+ {
+ gcc_assert (loc.index < 8);
+ rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
+ if (seq == PROLOGUE)
+ emit_move_insn (gpr, loc.reg);
+ else
+ emit_move_insn (loc.reg, gpr);
+ }
+}
+
+/* Emit instructions to save or restore the MEM_128 group. SEQ chooses
+ between saving and restoring. */
+
+void
+aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
+{
+ HOST_WIDE_INT count = m_group_count[MEM_128];
+ if (count == 0)
+ return;
+
+ auto sp = stack_pointer_rtx;
+ auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
+
+ /* Pick a common mode that supports LDR & STR with pre/post-modification
+ and LDP & STP with pre/post-modification. */
+ auto mode = TFmode;
+
+ /* An instruction pattern that should be emitted at the end. */
+ rtx last_pat = NULL_RTX;
+
+ /* A previous MEM_128 location that hasn't been handled yet. */
+ save_location *prev_loc = nullptr;
+
+ /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
+ for (auto &loc : m_save_locations)
+ if (loc.group == MEM_128)
+ {
+ if (!prev_loc)
+ {
+ prev_loc = &loc;
+ continue;
+ }
+ gcc_assert (loc.index == prev_loc->index + 1);
+
+ /* The offset of the base of the save area from the current
+ stack pointer. */
+ HOST_WIDE_INT bias = 0;
+ if (prev_loc->index == 0 && seq == PROLOGUE)
+ bias = sp_adjust;
+
+ /* Get the two sets in the LDP/STP. */
+ rtx ops[] = {
+ gen_rtx_REG (mode, REGNO (prev_loc->reg)),
+ get_slot_mem (mode, prev_loc->index * 16 + bias),
+ gen_rtx_REG (mode, REGNO (loc.reg)),
+ get_slot_mem (mode, loc.index * 16 + bias)
+ };
+ unsigned int lhs = (seq == PROLOGUE);
+ rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
+ rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
+
+ /* Combine the sets with any stack allocation/deallocation. */
+ rtvec vec;
+ if (prev_loc->index == 0)
+ {
+ rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
+ vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
+ }
+ else
+ vec = gen_rtvec (2, set1, set2);
+ rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
+
+	  /* If this pair also deallocates the save area, queue it until
+	     the end; otherwise emit the instruction now.  */
+ if (seq == EPILOGUE && prev_loc->index == 0)
+ last_pat = pat;
+ else
+ emit_insn (pat);
+ prev_loc = nullptr;
+ }
+
+ /* Handle any leftover LDR/STR. */
+ if (prev_loc)
+ {
+ rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
+ rtx addr;
+ if (prev_loc->index != 0)
+ addr = plus_constant (Pmode, sp, prev_loc->index * 16);
+ else if (seq == PROLOGUE)
+ {
+ rtx allocate = plus_constant (Pmode, sp, -count * 16);
+ addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
+ }
+ else
+ {
+ rtx deallocate = plus_constant (Pmode, sp, count * 16);
+ addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
+ }
+ rtx mem = gen_rtx_MEM (mode, addr);
+ if (seq == PROLOGUE)
+ emit_move_insn (mem, reg);
+ else
+ emit_move_insn (reg, mem);
+ }
+
+ if (last_pat)
+ emit_insn (last_pat);
+}
+
+/* Allocate or deallocate the stack space needed by the SVE groups.
+ SEQ chooses between allocating and deallocating. */
+
+void
+aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
+{
+ if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
+ emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
+}
+
+/* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
+ and restoring. */
+
+void
+aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
+{
+ for (auto &loc : m_save_locations)
+ if (loc.group == MEM_SVE_DATA)
+ {
+ auto index = loc.index + sve_data_headroom ();
+ emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
+ }
+}
+
+/* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
+ and restoring. */
+
+void
+aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
+{
+ for (auto &loc : m_save_locations)
+ if (loc.group == MEM_SVE_PRED)
+ emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
+}
+
/* Set DEST to (vec_series BASE STEP). */
static void
@@ -7934,6 +8397,40 @@ on_stack:
return;
}
+/* Add the current argument register to the set of those that need
+ to be saved and restored around a change to PSTATE.SM. */
+
+static void
+aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
+{
+ subrtx_var_iterator::array_type array;
+ FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
+ {
+ rtx x = *iter;
+ if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
+ {
+ unsigned int i = pcum->num_sme_mode_switch_args++;
+ gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
+ pcum->sme_mode_switch_args[i] = x;
+ }
+ }
+}
+
+/* Return a parallel that contains all the registers that need to be
+ saved around a change to PSTATE.SM. Return const0_rtx if there is
+ no such mode switch, or if no registers need to be saved. */
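+
+/* For example, if a double is passed in v0 and an svbool_t in p0, the
+   result is something like:
+
+     (parallel [(reg:DF V0_REGNUM) (reg:VNx16BI P0_REGNUM)])
+
+   with the modes taken from the argument rtxes recorded above.
+   aarch64_function_arg then pairs this parallel with the callee's
+   ABI cookie.  */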
+
+static rtx
+aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
+{
+ if (!pcum->num_sme_mode_switch_args)
+ return const0_rtx;
+
+ auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
+ pcum->sme_mode_switch_args);
+ return gen_rtx_PARALLEL (VOIDmode, argvec);
+}
+
/* Implement TARGET_FUNCTION_ARG. */
static rtx
@@ -7945,7 +8442,13 @@ aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
|| pcum->pcs_variant == ARM_PCS_SVE);
if (arg.end_marker_p ())
- return aarch64_gen_callee_cookie (pcum->isa_mode, pcum->pcs_variant);
+ {
+ rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
+ pcum->pcs_variant);
+ rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
+ return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, abi_cookie,
+ sme_mode_switch_args));
+ }
aarch64_layout_arg (pcum_v, arg);
return pcum->aapcs_reg;
@@ -7980,6 +8483,7 @@ aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
pcum->aapcs_stack_words = 0;
pcum->aapcs_stack_size = 0;
pcum->silent_p = silent_p;
+ pcum->num_sme_mode_switch_args = 0;
if (!silent_p
&& !TARGET_FLOAT
@@ -8020,6 +8524,10 @@ aarch64_function_arg_advance (cumulative_args_t pcum_v,
aarch64_layout_arg (pcum_v, arg);
gcc_assert ((pcum->aapcs_reg != NULL_RTX)
!= (pcum->aapcs_stack_words != 0));
+ if (pcum->aapcs_reg
+ && aarch64_call_switches_sm_state (pcum->isa_mode))
+ aarch64_record_sme_mode_switch_args (pcum);
+
pcum->aapcs_arg_processed = false;
pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
@@ -8457,6 +8965,30 @@ aarch64_needs_frame_chain (void)
return aarch64_use_frame_pointer;
}
+/* Return true if the current function needs to record the incoming
+ value of PSTATE.SM. */
+static bool
+aarch64_need_old_pstate_sm ()
+{
+ /* Exit early if the incoming value of PSTATE.SM is known at
+ compile time. */
+ if (aarch64_cfun_incoming_sm_state () != 0)
+ return false;
+
+ if (cfun->machine->call_switches_sm_state)
+ for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ if (auto *call = dyn_cast<rtx_call_insn *> (insn))
+ if (!SIBLING_CALL_P (call))
+ {
+ /* Return true if there is a call to a non-streaming-compatible
+ function. */
+ auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
+ if (aarch64_call_switches_sm_state (callee_isa_mode))
+ return true;
+ }
+ return false;
+}
+
/* Mark the registers that need to be saved by the callee and calculate
the size of the callee-saved registers area and frame record (both FP
and LR may be omitted). */
@@ -8486,6 +9018,7 @@ aarch64_layout_frame (void)
/* First mark all the registers that really need to be saved... */
for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
+ frame.old_svcr_offset = SLOT_NOT_REQUIRED;
/* ... that includes the eh data registers (if needed)... */
if (crtl->calls_eh_return)
@@ -8612,6 +9145,12 @@ aarch64_layout_frame (void)
offset += UNITS_PER_WORD;
}
+ if (aarch64_need_old_pstate_sm ())
+ {
+ frame.old_svcr_offset = offset;
+ offset += UNITS_PER_WORD;
+ }
+
poly_int64 max_int_offset = offset;
offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
bool has_align_gap = maybe_ne (offset, max_int_offset);
@@ -9908,6 +10447,48 @@ aarch64_epilogue_uses (int regno)
return 0;
}
+/* The current function's frame has a save slot for the incoming state
+ of SVCR. Return a legitimate memory for the slot, based on the hard
+ frame pointer. */
+
+static rtx
+aarch64_old_svcr_mem ()
+{
+ gcc_assert (frame_pointer_needed
+ && known_ge (cfun->machine->frame.old_svcr_offset, 0));
+ rtx base = hard_frame_pointer_rtx;
+ poly_int64 offset = (/* hard fp -> top of frame. */
+ cfun->machine->frame.hard_fp_offset
+ /* top of frame -> bottom of frame. */
+ - cfun->machine->frame.frame_size
+ /* bottom of frame -> save slot. */
+ + cfun->machine->frame.old_svcr_offset);
+ return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
+}
+
+/* The current function's frame has a save slot for the incoming state
+ of SVCR. Load the slot into register REGNO and return the register. */
+
+static rtx
+aarch64_read_old_svcr (unsigned int regno)
+{
+ rtx svcr = gen_rtx_REG (DImode, regno);
+ emit_move_insn (svcr, aarch64_old_svcr_mem ());
+ return svcr;
+}
+
+/* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
+ load the incoming value of SVCR from its save slot into temporary
+ register REGNO. */
+
+static rtx_insn *
+aarch64_guard_switch_pstate_sm (unsigned int regno,
+ aarch64_feature_flags local_mode)
+{
+ rtx old_svcr = aarch64_read_old_svcr (regno);
+ return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
+}
+
/* AArch64 stack frames generated by this compiler look like:
+-------------------------------+
@@ -10141,6 +10722,40 @@ aarch64_expand_prologue (void)
that is assumed by the called. */
aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
!frame_pointer_needed, true);
+
+ /* Save the incoming value of PSTATE.SM, if required. */
+ if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
+ {
+ rtx mem = aarch64_old_svcr_mem ();
+ MEM_VOLATILE_P (mem) = 1;
+ if (TARGET_SME)
+ {
+ rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
+ emit_insn (gen_aarch64_read_svcr (reg));
+ emit_move_insn (mem, reg);
+ }
+ else
+ {
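+ /* __arm_sme_state returns its result in x0 and x1, so move any
+ incoming arguments that are live in those registers to the
+ stack-probe scratch registers around the call. */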
+ rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
+ auto &args = crtl->args.info;
+ if (args.aapcs_ncrn > 0)
+ {
+ old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
+ emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
+ }
+ if (args.aapcs_ncrn > 1)
+ {
+ old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
+ emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
+ }
+ emit_insn (gen_aarch64_get_sme_state ());
+ emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
+ if (old_r0)
+ emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
+ if (old_r1)
+ emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
+ }
+ }
}
/* Return TRUE if we can use a simple_return insn.
@@ -11395,17 +12010,33 @@ aarch64_start_call_args (cumulative_args_t ca_v)
RESULT is the register in which the result is returned. It's NULL for
"call" and "sibcall".
MEM is the location of the function call.
- CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
+ COOKIE is either:
+ - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
+ - a PARALLEL that contains such a const_int as its first element.
+ The second element is a PARALLEL that lists all the argument
+ registers that need to be saved and restored around a change
+ in PSTATE.SM, or const0_rtx if no such switch is needed.
SIBCALL indicates whether this function call is normal call or sibling call.
It will generate different pattern accordingly. */
void
-aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
+aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
{
rtx call, callee, tmp;
rtvec vec;
machine_mode mode;
+ rtx callee_abi = cookie;
+ rtx sme_mode_switch_args = const0_rtx;
+ if (GET_CODE (cookie) == PARALLEL)
+ {
+ callee_abi = XVECEXP (cookie, 0, 0);
+ sme_mode_switch_args = XVECEXP (cookie, 0, 1);
+ }
+
+ gcc_assert (CONST_INT_P (callee_abi));
+ auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
+
gcc_assert (MEM_P (mem));
callee = XEXP (mem, 0);
mode = GET_MODE (callee);
@@ -11430,26 +12061,67 @@ aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
else
tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
- gcc_assert (CONST_INT_P (callee_abi));
callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
UNSPEC_CALLEE_ABI);
vec = gen_rtvec (3, call, callee_abi, tmp);
call = gen_rtx_PARALLEL (VOIDmode, vec);
- aarch64_emit_call_insn (call);
+ auto call_insn = aarch64_emit_call_insn (call);
+
+ /* Check whether the call requires a change to PSTATE.SM. We can't
+ emit the instructions to change PSTATE.SM yet, since they involve
+ a change in vector length and a change in instruction set, which
+ cannot be represented in RTL.
+
+ For now, just record which registers will be clobbered by the
+ changes to PSTATE.SM. */
+ if (!sibcall && aarch64_call_switches_sm_state (callee_isa_mode))
+ {
+ aarch64_sme_mode_switch_regs args_switch;
+ if (sme_mode_switch_args != const0_rtx)
+ {
+ unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
+ for (unsigned int i = 0; i < num_args; ++i)
+ {
+ rtx x = XVECEXP (sme_mode_switch_args, 0, i);
+ args_switch.add_reg (GET_MODE (x), REGNO (x));
+ }
+ }
+
+ aarch64_sme_mode_switch_regs result_switch;
+ if (result)
+ result_switch.add_reg (GET_MODE (result), REGNO (result));
+
+ unsigned int num_gprs = MAX (args_switch.num_gprs (),
+ result_switch.num_gprs ());
+ for (unsigned int i = 0; i < num_gprs; ++i)
+ clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
+ gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
+
+ for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
+ clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
+ gen_rtx_REG (V4x16QImode, regno));
+
+ for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
+ clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
+ gen_rtx_REG (VNx16BImode, regno));
+
+ cfun->machine->call_switches_sm_state = true;
+ }
}
/* Emit call insn with PAT and do aarch64-specific handling. */
-void
+rtx_insn *
aarch64_emit_call_insn (rtx pat)
{
- rtx insn = emit_call_insn (pat);
+ auto insn = emit_call_insn (pat);
rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
+ return insn;
}
machine_mode
@@ -12761,6 +13433,16 @@ aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
return false;
}
+/* Implement TARGET_FRAME_POINTER_REQUIRED. */
+
+static bool
+aarch64_frame_pointer_required ()
+{
+ /* If the function needs to record the incoming value of PSTATE.SM,
+ make sure that the slot is accessible from the frame pointer. */
+ return aarch64_need_old_pstate_sm ();
+}
+
static bool
aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
{
@@ -27496,6 +28178,122 @@ aarch64_indirect_call_asm (rtx addr)
return "";
}
+/* If CALL involves a change in PSTATE.SM, emit the instructions needed
+ to switch to the new mode and the instructions needed to restore the
+ original mode. Return true if something changed. */
+static bool
+aarch64_switch_sm_state_for_call (rtx_call_insn *call)
+{
+ /* Mode switches for sibling calls are handled via the epilogue. */
+ if (SIBLING_CALL_P (call))
+ return false;
+
+ auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
+ if (!aarch64_call_switches_sm_state (callee_isa_mode))
+ return false;
+
+ /* Switch mode before the call, preserving any argument registers
+ across the switch. */
+ start_sequence ();
+ rtx_insn *args_guard_label = nullptr;
+ if (TARGET_STREAMING_COMPATIBLE)
+ args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+ callee_isa_mode);
+ aarch64_sme_mode_switch_regs args_switch;
+ args_switch.add_call_args (call);
+ args_switch.emit_prologue ();
+ aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
+ args_switch.emit_epilogue ();
+ if (args_guard_label)
+ emit_label (args_guard_label);
+ auto args_seq = get_insns ();
+ end_sequence ();
+ emit_insn_before (args_seq, call);
+
+ if (find_reg_note (call, REG_NORETURN, NULL_RTX))
+ return true;
+
+ /* Switch mode after the call, preserving any return registers across
+ the switch. */
+ start_sequence ();
+ rtx_insn *return_guard_label = nullptr;
+ if (TARGET_STREAMING_COMPATIBLE)
+ return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
+ callee_isa_mode);
+ aarch64_sme_mode_switch_regs return_switch;
+ return_switch.add_call_result (call);
+ return_switch.emit_prologue ();
+ aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
+ return_switch.emit_epilogue ();
+ if (return_guard_label)
+ emit_label (return_guard_label);
+ auto result_seq = get_insns ();
+ end_sequence ();
+ emit_insn_after (result_seq, call);
+ return true;
+}
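+
+/* For a streaming-compatible caller, the code emitted above around a call
+   to a non-streaming function is therefore roughly:
+
+	ldr	x16, [x29, #16]		// saved incoming PSTATE.SM
+	tbz	x16, 0, 1f
+	smstop	sm
+     1:	bl	callee
+	ldr	x16, [x29, #16]
+	tbz	x16, 0, 2f
+	smstart	sm
+     2:
+
+   with any live argument and return registers saved and restored around
+   each SMSTOP/SMSTART (label numbers are illustrative, and the slot
+   offset depends on the frame layout).  */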
+
+namespace {
+
+const pass_data pass_data_switch_sm_state =
+{
+ RTL_PASS, // type
+ "smstarts", // name
+ OPTGROUP_NONE, // optinfo_flags
+ TV_NONE, // tv_id
+ 0, // properties_required
+ 0, // properties_provided
+ 0, // properties_destroyed
+ 0, // todo_flags_start
+ TODO_df_finish, // todo_flags_finish
+};
+
+class pass_switch_sm_state : public rtl_opt_pass
+{
+public:
+ pass_switch_sm_state (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_switch_sm_state, ctxt)
+ {}
+
+ // opt_pass methods:
+ bool gate (function *) override final;
+ unsigned int execute (function *) override final;
+};
+
+bool
+pass_switch_sm_state::gate (function *)
+{
+ return cfun->machine->call_switches_sm_state;
+}
+
+/* Emit any instructions needed to switch PSTATE.SM. */
+unsigned int
+pass_switch_sm_state::execute (function *fn)
+{
+ basic_block bb;
+
+ auto_sbitmap blocks (last_basic_block_for_fn (cfun));
+ bitmap_clear (blocks);
+ FOR_EACH_BB_FN (bb, fn)
+ {
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (auto *call = dyn_cast<rtx_call_insn *> (insn))
+ if (aarch64_switch_sm_state_for_call (call))
+ bitmap_set_bit (blocks, bb->index);
+ }
+ find_many_sub_basic_blocks (blocks);
+ return 0;
+}
+
+}
+
+rtl_opt_pass *
+make_pass_switch_sm_state (gcc::context *ctxt)
+{
+ return new pass_switch_sm_state (ctxt);
+}
+
/* Target-specific selftests. */
#if CHECKING_P
@@ -27683,6 +28481,9 @@ aarch64_run_selftests (void)
#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
+#undef TARGET_FRAME_POINTER_REQUIRED
+#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
+
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate
@@ -255,6 +255,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
/* The current function is a normal non-streaming function. */
#define TARGET_NON_STREAMING (AARCH64_ISA_SM_OFF)
+/* The current function has a streaming-compatible body. */
+#define TARGET_STREAMING_COMPATIBLE \
+ ((aarch64_isa_flags & AARCH64_FL_SM_STATE) == 0)
+
/* Crypto is an optional extension to AdvSIMD. */
#define TARGET_CRYPTO (AARCH64_ISA_CRYPTO)
@@ -304,6 +308,10 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF;
/* SVE2 SM4 instructions, enabled through +sve2-sm4. */
#define TARGET_SVE2_SM4 (AARCH64_ISA_SVE2_SM4 && TARGET_NON_STREAMING)
+/* SME instructions, enabled through +sme. Note that this does not
+ imply anything about the state of PSTATE.SM. */
+#define TARGET_SME (AARCH64_ISA_SME)
+
/* ARMv8.3-A features. */
#define TARGET_ARMV8_3 (AARCH64_ISA_V8_3A)
@@ -802,6 +810,13 @@ struct GTY (()) aarch64_frame
STACK_BOUNDARY. */
poly_int64 locals_offset;
+ /* The offset from the base of the frame of a 64-bit slot whose low
+ bit contains the incoming value of PSTATE.SM. This slot must be
+ within reach of the hard frame pointer.
+
+ The offset is negative if such a slot isn't needed. */
+ poly_int64 old_svcr_offset;
+
/* Offset from the base of the frame (incomming SP) to the
hard_frame_pointer. This value is always a multiple of
STACK_BOUNDARY. */
@@ -884,6 +899,10 @@ typedef struct GTY (()) machine_function
/* One entry for each general purpose register. */
rtx call_via[SP_REGNUM];
bool label_is_assembled;
+ /* True if we've expanded at least one call to a function that changes
+ PSTATE.SM. This should only be used for saving compile time: false
+ guarantees that no such mode switch exists. */
+ bool call_switches_sm_state;
} machine_function;
#endif
@@ -948,6 +967,12 @@ typedef struct
stack arg area so far. */
bool silent_p; /* True if we should act silently, rather than
raise an error for invalid calls. */
+
+ /* A list of registers that need to be saved and restored around a
+ change to PSTATE.SM. An auto_vec would be more convenient, but those
+ can't be copied. */
+ unsigned int num_sme_mode_switch_args;
+ rtx sme_mode_switch_args[12];
} CUMULATIVE_ARGS;
#endif
@@ -940,7 +940,7 @@ (define_insn "*cb<optab><mode>1"
(const_int 1)))]
)
-(define_insn "*tb<optab><mode>1"
+(define_insn "@aarch64_tb<optab><mode>"
[(set (pc) (if_then_else
(EQL (zero_extract:DI (match_operand:GPI 0 "register_operand" "r")
(const_int 1)
@@ -1027,7 +1027,7 @@ (define_expand "call"
[(parallel
[(call (match_operand 0 "memory_operand")
(match_operand 1 "general_operand"))
- (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI)
+ (unspec:DI [(match_operand 2)] UNSPEC_CALLEE_ABI)
(clobber (reg:DI LR_REGNUM))])]
""
"
@@ -1053,7 +1053,7 @@ (define_expand "call_value"
[(set (match_operand 0 "")
(call (match_operand 1 "memory_operand")
(match_operand 2 "general_operand")))
- (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI)
+ (unspec:DI [(match_operand 3)] UNSPEC_CALLEE_ABI)
(clobber (reg:DI LR_REGNUM))])]
""
"
@@ -1080,7 +1080,7 @@ (define_expand "sibcall"
[(parallel
[(call (match_operand 0 "memory_operand")
(match_operand 1 "general_operand"))
- (unspec:DI [(match_operand 2 "const_int_operand")] UNSPEC_CALLEE_ABI)
+ (unspec:DI [(match_operand 2)] UNSPEC_CALLEE_ABI)
(return)])]
""
{
@@ -1094,7 +1094,7 @@ (define_expand "sibcall_value"
[(set (match_operand 0 "")
(call (match_operand 1 "memory_operand")
(match_operand 2 "general_operand")))
- (unspec:DI [(match_operand 3 "const_int_operand")] UNSPEC_CALLEE_ABI)
+ (unspec:DI [(match_operand 3)] UNSPEC_CALLEE_ABI)
(return)])]
""
{
@@ -7783,3 +7783,6 @@ (define_insn "st64bv0"
;; SVE2.
(include "aarch64-sve2.md")
+
+;; SME and extensions
+(include "aarch64-sme.md")
@@ -186,7 +186,8 @@ MULTILIB_DIRNAMES = $(subst $(comma), ,$(TM_MULTILIB_CONFIG))
insn-conditions.md: s-check-sve-md
s-check-sve-md: $(srcdir)/config/aarch64/check-sve-md.awk \
$(srcdir)/config/aarch64/aarch64-sve.md \
- $(srcdir)/config/aarch64/aarch64-sve2.md
+ $(srcdir)/config/aarch64/aarch64-sve2.md \
+ $(srcdir)/config/aarch64/aarch64-sme.md
$(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \
$(srcdir)/config/aarch64/aarch64-sve.md
$(AWK) -f $(srcdir)/config/aarch64/check-sve-md.awk \
new file mode 100644
@@ -0,0 +1,195 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+void ns_callee ();
+__attribute__((arm_streaming)) void s_callee ();
+__attribute__((arm_streaming_compatible)) void sc_callee ();
+
+struct callbacks {
+ void (*ns_ptr) ();
+ __attribute__((arm_streaming)) void (*s_ptr) ();
+ __attribute__((arm_streaming_compatible)) void (*sc_ptr) ();
+};
+
+/*
+** n_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-80\]!
+** stp d8, d9, \[sp, #?16\]
+** stp d10, d11, \[sp, #?32\]
+** stp d12, d13, \[sp, #?48\]
+** stp d14, d15, \[sp, #?64\]
+** mov \1, x0
+** bl ns_callee
+** smstart sm
+** bl s_callee
+** smstop sm
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** blr \2
+** ldr (x[0-9]+), \[\1, #?8\]
+** smstart sm
+** blr \3
+** smstop sm
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldp d8, d9, \[sp, #?16\]
+** ldp d10, d11, \[sp, #?32\]
+** ldp d12, d13, \[sp, #?48\]
+** ldp d14, d15, \[sp, #?64\]
+** ldp \1, x30, \[sp\], #?80
+** ret
+*/
+void
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+/*
+** s_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-80\]!
+** stp d8, d9, \[sp, #?16\]
+** stp d10, d11, \[sp, #?32\]
+** stp d12, d13, \[sp, #?48\]
+** stp d14, d15, \[sp, #?64\]
+** mov \1, x0
+** smstop sm
+** bl ns_callee
+** smstart sm
+** bl s_callee
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** smstop sm
+** blr \2
+** smstart sm
+** ldr (x[0-9]+), \[\1, #?8\]
+** blr \3
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldp d8, d9, \[sp, #?16\]
+** ldp d10, d11, \[sp, #?32\]
+** ldp d12, d13, \[sp, #?48\]
+** ldp d14, d15, \[sp, #?64\]
+** ldp \1, x30, \[sp\], #?80
+** ret
+*/
+void __attribute__((arm_streaming))
+s_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+/*
+** sc_caller_sme:
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** smstop sm
+** bl ns_callee
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** smstart sm
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** smstart sm
+** bl s_callee
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** smstop sm
+** bl sc_callee
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+sc_caller_sme ()
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+}
+
+#pragma GCC target "+nosme"
+
+/*
+** sc_caller:
+** stp x29, x30, \[sp, #?-96\]!
+** mov x29, sp
+** stp d8, d9, \[sp, #?32\]
+** stp d10, d11, \[sp, #?48\]
+** stp d12, d13, \[sp, #?64\]
+** stp d14, d15, \[sp, #?80\]
+** bl __arm_sme_state
+** str x0, \[x29, #?16\]
+** ...
+** bl sc_callee
+** ldp d8, d9, \[sp, #?32\]
+** ldp d10, d11, \[sp, #?48\]
+** ldp d12, d13, \[sp, #?64\]
+** ldp d14, d15, \[sp, #?80\]
+** ldp x29, x30, \[sp\], #?96
+** ret
+*/
+void __attribute__((arm_streaming_compatible))
+sc_caller ()
+{
+ ns_callee ();
+ sc_callee ();
+}
+
+/*
+** sc_caller_x0:
+** ...
+** mov x10, x0
+** bl __arm_sme_state
+** ...
+** str wzr, \[x10\]
+** ...
+*/
+void __attribute__((arm_streaming_compatible))
+sc_caller_x0 (int *ptr)
+{
+ *ptr = 0;
+ ns_callee ();
+ sc_callee ();
+}
+
+/*
+** sc_caller_x1:
+** ...
+** mov x10, x0
+** mov x11, x1
+** bl __arm_sme_state
+** ...
+** str w11, \[x10\]
+** ...
+*/
+void __attribute__((arm_streaming_compatible))
+sc_caller_x1 (int *ptr, int a)
+{
+ *ptr = a;
+ ns_callee ();
+ sc_callee ();
+}
new file mode 100644
@@ -0,0 +1,37 @@
+// { dg-options "" }
+
+#pragma GCC target "+nosme"
+
+void ns_callee ();
+__attribute__((arm_streaming)) void s_callee ();
+__attribute__((arm_streaming_compatible)) void sc_callee ();
+
+struct callbacks {
+ void (*ns_ptr) ();
+ __attribute__((arm_streaming)) void (*s_ptr) ();
+ __attribute__((arm_streaming_compatible)) void (*sc_ptr) ();
+};
+
+void
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" }
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" }
+ c->sc_ptr ();
+}
+
+void __attribute__((arm_streaming_compatible))
+sc_caller_sme (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" }
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr (); // { dg-error "calling a streaming function requires the ISA extension 'sme'" }
+ c->sc_ptr ();
+}
new file mode 100644
@@ -0,0 +1,43 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+
+void ns_callee ();
+__attribute__((arm_streaming)) void s_callee ();
+__attribute__((arm_streaming_compatible)) void sc_callee ();
+
+struct callbacks {
+ void (*ns_ptr) ();
+ __attribute__((arm_streaming)) void (*s_ptr) ();
+ __attribute__((arm_streaming_compatible)) void (*sc_ptr) ();
+};
+
+void
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->sc_ptr ();
+}
+
+void __attribute__((arm_streaming))
+s_caller (struct callbacks *c)
+{
+ s_callee ();
+ sc_callee ();
+
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+void __attribute__((arm_streaming_compatible))
+sc_caller (struct callbacks *c)
+{
+ sc_callee ();
+
+ c->sc_ptr ();
+}
+
+// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } }
+// { dg-final { scan-assembler-not {smstart\tsm} } }
+// { dg-final { scan-assembler-not {smstop\tsm} } }
new file mode 100644
@@ -0,0 +1,156 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+__attribute__((aarch64_vector_pcs)) void ns_callee ();
+__attribute__((arm_streaming, aarch64_vector_pcs)) void s_callee ();
+__attribute__((arm_streaming_compatible, aarch64_vector_pcs)) void sc_callee ();
+
+struct callbacks {
+ __attribute__((aarch64_vector_pcs)) void (*ns_ptr) ();
+ __attribute__((arm_streaming, aarch64_vector_pcs)) void (*s_ptr) ();
+ __attribute__((arm_streaming_compatible, aarch64_vector_pcs)) void (*sc_ptr) ();
+};
+
+/*
+** n_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-272\]!
+** stp q8, q9, \[sp, #?16\]
+** stp q10, q11, \[sp, #?48\]
+** stp q12, q13, \[sp, #?80\]
+** stp q14, q15, \[sp, #?112\]
+** stp q16, q17, \[sp, #?144\]
+** stp q18, q19, \[sp, #?176\]
+** stp q20, q21, \[sp, #?208\]
+** stp q22, q23, \[sp, #?240\]
+** mov \1, x0
+** bl ns_callee
+** smstart sm
+** bl s_callee
+** smstop sm
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** blr \2
+** ldr (x[0-9]+), \[\1, #?8\]
+** smstart sm
+** blr \3
+** smstop sm
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldp q8, q9, \[sp, #?16\]
+** ldp q10, q11, \[sp, #?48\]
+** ldp q12, q13, \[sp, #?80\]
+** ldp q14, q15, \[sp, #?112\]
+** ldp q16, q17, \[sp, #?144\]
+** ldp q18, q19, \[sp, #?176\]
+** ldp q20, q21, \[sp, #?208\]
+** ldp q22, q23, \[sp, #?240\]
+** ldp \1, x30, \[sp\], #?272
+** ret
+*/
+void __attribute__((aarch64_vector_pcs))
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+/*
+** s_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-272\]!
+** stp q8, q9, \[sp, #?16\]
+** stp q10, q11, \[sp, #?48\]
+** stp q12, q13, \[sp, #?80\]
+** stp q14, q15, \[sp, #?112\]
+** stp q16, q17, \[sp, #?144\]
+** stp q18, q19, \[sp, #?176\]
+** stp q20, q21, \[sp, #?208\]
+** stp q22, q23, \[sp, #?240\]
+** mov \1, x0
+** smstop sm
+** bl ns_callee
+** smstart sm
+** bl s_callee
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** smstop sm
+** blr \2
+** smstart sm
+** ldr (x[0-9]+), \[\1, #?8\]
+** blr \3
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldp q8, q9, \[sp, #?16\]
+** ldp q10, q11, \[sp, #?48\]
+** ldp q12, q13, \[sp, #?80\]
+** ldp q14, q15, \[sp, #?112\]
+** ldp q16, q17, \[sp, #?144\]
+** ldp q18, q19, \[sp, #?176\]
+** ldp q20, q21, \[sp, #?208\]
+** ldp q22, q23, \[sp, #?240\]
+** ldp \1, x30, \[sp\], #?272
+** ret
+*/
+void __attribute__((arm_streaming, aarch64_vector_pcs))
+s_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+/*
+** sc_caller:
+** stp x29, x30, \[sp, #?-288\]!
+** mov x29, sp
+** stp q8, q9, \[sp, #?32\]
+** stp q10, q11, \[sp, #?64\]
+** stp q12, q13, \[sp, #?96\]
+** stp q14, q15, \[sp, #?128\]
+** stp q16, q17, \[sp, #?160\]
+** stp q18, q19, \[sp, #?192\]
+** stp q20, q21, \[sp, #?224\]
+** stp q22, q23, \[sp, #?256\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** smstop sm
+** bl ns_callee
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** smstart sm
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** smstart sm
+** bl s_callee
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** smstop sm
+** bl sc_callee
+** ldp q8, q9, \[sp, #?32\]
+** ldp q10, q11, \[sp, #?64\]
+** ldp q12, q13, \[sp, #?96\]
+** ldp q14, q15, \[sp, #?128\]
+** ldp q16, q17, \[sp, #?160\]
+** ldp q18, q19, \[sp, #?192\]
+** ldp q20, q21, \[sp, #?224\]
+** ldp q22, q23, \[sp, #?256\]
+** ldp x29, x30, \[sp\], #?288
+** ret
+*/
+void __attribute__((arm_streaming_compatible, aarch64_vector_pcs))
+sc_caller ()
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+}
new file mode 100644
@@ -0,0 +1,43 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+
+__attribute__((aarch64_vector_pcs)) void ns_callee ();
+__attribute__((aarch64_vector_pcs, arm_streaming)) void s_callee ();
+__attribute__((aarch64_vector_pcs, arm_streaming_compatible)) void sc_callee ();
+
+struct callbacks {
+ __attribute__((aarch64_vector_pcs)) void (*ns_ptr) ();
+ __attribute__((aarch64_vector_pcs, arm_streaming)) void (*s_ptr) ();
+ __attribute__((aarch64_vector_pcs, arm_streaming_compatible)) void (*sc_ptr) ();
+};
+
+void __attribute__((aarch64_vector_pcs))
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->sc_ptr ();
+}
+
+void __attribute__((aarch64_vector_pcs, arm_streaming))
+s_caller (struct callbacks *c)
+{
+ s_callee ();
+ sc_callee ();
+
+ c->s_ptr ();
+ c->sc_ptr ();
+}
+
+void __attribute__((aarch64_vector_pcs, arm_streaming_compatible))
+sc_caller (struct callbacks *c)
+{
+ sc_callee ();
+
+ c->sc_ptr ();
+}
+
+// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } }
+// { dg-final { scan-assembler-not {smstart\tsm} } }
+// { dg-final { scan-assembler-not {smstop\tsm} } }
new file mode 100644
@@ -0,0 +1,308 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sve.h>
+
+svbool_t ns_callee ();
+__attribute__((arm_streaming)) svbool_t s_callee ();
+__attribute__((arm_streaming_compatible)) svbool_t sc_callee ();
+
+struct callbacks {
+ svbool_t (*ns_ptr) ();
+ __attribute__((arm_streaming)) svbool_t (*s_ptr) ();
+ __attribute__((arm_streaming_compatible)) svbool_t (*sc_ptr) ();
+};
+
+/*
+** n_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-16\]!
+** addvl sp, sp, #-18
+** str p4, \[sp\]
+** str p5, \[sp, #1, mul vl\]
+** str p6, \[sp, #2, mul vl\]
+** str p7, \[sp, #3, mul vl\]
+** str p8, \[sp, #4, mul vl\]
+** str p9, \[sp, #5, mul vl\]
+** str p10, \[sp, #6, mul vl\]
+** str p11, \[sp, #7, mul vl\]
+** str p12, \[sp, #8, mul vl\]
+** str p13, \[sp, #9, mul vl\]
+** str p14, \[sp, #10, mul vl\]
+** str p15, \[sp, #11, mul vl\]
+** str z8, \[sp, #2, mul vl\]
+** str z9, \[sp, #3, mul vl\]
+** str z10, \[sp, #4, mul vl\]
+** str z11, \[sp, #5, mul vl\]
+** str z12, \[sp, #6, mul vl\]
+** str z13, \[sp, #7, mul vl\]
+** str z14, \[sp, #8, mul vl\]
+** str z15, \[sp, #9, mul vl\]
+** str z16, \[sp, #10, mul vl\]
+** str z17, \[sp, #11, mul vl\]
+** str z18, \[sp, #12, mul vl\]
+** str z19, \[sp, #13, mul vl\]
+** str z20, \[sp, #14, mul vl\]
+** str z21, \[sp, #15, mul vl\]
+** str z22, \[sp, #16, mul vl\]
+** str z23, \[sp, #17, mul vl\]
+** mov \1, x0
+** bl ns_callee
+** smstart sm
+** bl s_callee
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** blr \2
+** ldr (x[0-9]+), \[\1, #?8\]
+** smstart sm
+** blr \3
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldr z8, \[sp, #2, mul vl\]
+** ldr z9, \[sp, #3, mul vl\]
+** ldr z10, \[sp, #4, mul vl\]
+** ldr z11, \[sp, #5, mul vl\]
+** ldr z12, \[sp, #6, mul vl\]
+** ldr z13, \[sp, #7, mul vl\]
+** ldr z14, \[sp, #8, mul vl\]
+** ldr z15, \[sp, #9, mul vl\]
+** ldr z16, \[sp, #10, mul vl\]
+** ldr z17, \[sp, #11, mul vl\]
+** ldr z18, \[sp, #12, mul vl\]
+** ldr z19, \[sp, #13, mul vl\]
+** ldr z20, \[sp, #14, mul vl\]
+** ldr z21, \[sp, #15, mul vl\]
+** ldr z22, \[sp, #16, mul vl\]
+** ldr z23, \[sp, #17, mul vl\]
+** ldr p4, \[sp\]
+** ldr p5, \[sp, #1, mul vl\]
+** ldr p6, \[sp, #2, mul vl\]
+** ldr p7, \[sp, #3, mul vl\]
+** ldr p8, \[sp, #4, mul vl\]
+** ldr p9, \[sp, #5, mul vl\]
+** ldr p10, \[sp, #6, mul vl\]
+** ldr p11, \[sp, #7, mul vl\]
+** ldr p12, \[sp, #8, mul vl\]
+** ldr p13, \[sp, #9, mul vl\]
+** ldr p14, \[sp, #10, mul vl\]
+** ldr p15, \[sp, #11, mul vl\]
+** addvl sp, sp, #18
+** ldp \1, x30, \[sp\], #?16
+** ret
+*/
+svbool_t
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ return c->sc_ptr ();
+}
+
+/*
+** s_caller: { target lp64 }
+** stp (x19|x2[0-8]), x30, \[sp, #?-16\]!
+** addvl sp, sp, #-18
+** str p4, \[sp\]
+** str p5, \[sp, #1, mul vl\]
+** str p6, \[sp, #2, mul vl\]
+** str p7, \[sp, #3, mul vl\]
+** str p8, \[sp, #4, mul vl\]
+** str p9, \[sp, #5, mul vl\]
+** str p10, \[sp, #6, mul vl\]
+** str p11, \[sp, #7, mul vl\]
+** str p12, \[sp, #8, mul vl\]
+** str p13, \[sp, #9, mul vl\]
+** str p14, \[sp, #10, mul vl\]
+** str p15, \[sp, #11, mul vl\]
+** str z8, \[sp, #2, mul vl\]
+** str z9, \[sp, #3, mul vl\]
+** str z10, \[sp, #4, mul vl\]
+** str z11, \[sp, #5, mul vl\]
+** str z12, \[sp, #6, mul vl\]
+** str z13, \[sp, #7, mul vl\]
+** str z14, \[sp, #8, mul vl\]
+** str z15, \[sp, #9, mul vl\]
+** str z16, \[sp, #10, mul vl\]
+** str z17, \[sp, #11, mul vl\]
+** str z18, \[sp, #12, mul vl\]
+** str z19, \[sp, #13, mul vl\]
+** str z20, \[sp, #14, mul vl\]
+** str z21, \[sp, #15, mul vl\]
+** str z22, \[sp, #16, mul vl\]
+** str z23, \[sp, #17, mul vl\]
+** mov \1, x0
+** smstop sm
+** bl ns_callee
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** bl s_callee
+** bl sc_callee
+** ldr (x[0-9]+), \[\1\]
+** smstop sm
+** blr \2
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** ldr (x[0-9]+), \[\1, #?8\]
+** blr \3
+** ldr (x[0-9]+), \[\1, #?16\]
+** blr \4
+** ldr z8, \[sp, #2, mul vl\]
+** ldr z9, \[sp, #3, mul vl\]
+** ldr z10, \[sp, #4, mul vl\]
+** ldr z11, \[sp, #5, mul vl\]
+** ldr z12, \[sp, #6, mul vl\]
+** ldr z13, \[sp, #7, mul vl\]
+** ldr z14, \[sp, #8, mul vl\]
+** ldr z15, \[sp, #9, mul vl\]
+** ldr z16, \[sp, #10, mul vl\]
+** ldr z17, \[sp, #11, mul vl\]
+** ldr z18, \[sp, #12, mul vl\]
+** ldr z19, \[sp, #13, mul vl\]
+** ldr z20, \[sp, #14, mul vl\]
+** ldr z21, \[sp, #15, mul vl\]
+** ldr z22, \[sp, #16, mul vl\]
+** ldr z23, \[sp, #17, mul vl\]
+** ldr p4, \[sp\]
+** ldr p5, \[sp, #1, mul vl\]
+** ldr p6, \[sp, #2, mul vl\]
+** ldr p7, \[sp, #3, mul vl\]
+** ldr p8, \[sp, #4, mul vl\]
+** ldr p9, \[sp, #5, mul vl\]
+** ldr p10, \[sp, #6, mul vl\]
+** ldr p11, \[sp, #7, mul vl\]
+** ldr p12, \[sp, #8, mul vl\]
+** ldr p13, \[sp, #9, mul vl\]
+** ldr p14, \[sp, #10, mul vl\]
+** ldr p15, \[sp, #11, mul vl\]
+** addvl sp, sp, #18
+** ldp \1, x30, \[sp\], #?16
+** ret
+*/
+svbool_t __attribute__((arm_streaming))
+s_caller (struct callbacks *c)
+{
+ ns_callee ();
+ s_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ c->s_ptr ();
+ return c->sc_ptr ();
+}
+
+/*
+** sc_caller:
+** stp x29, x30, \[sp, #?-32\]!
+** mov x29, sp
+** addvl sp, sp, #-18
+** str p4, \[sp\]
+** str p5, \[sp, #1, mul vl\]
+** str p6, \[sp, #2, mul vl\]
+** str p7, \[sp, #3, mul vl\]
+** str p8, \[sp, #4, mul vl\]
+** str p9, \[sp, #5, mul vl\]
+** str p10, \[sp, #6, mul vl\]
+** str p11, \[sp, #7, mul vl\]
+** str p12, \[sp, #8, mul vl\]
+** str p13, \[sp, #9, mul vl\]
+** str p14, \[sp, #10, mul vl\]
+** str p15, \[sp, #11, mul vl\]
+** str z8, \[sp, #2, mul vl\]
+** str z9, \[sp, #3, mul vl\]
+** str z10, \[sp, #4, mul vl\]
+** str z11, \[sp, #5, mul vl\]
+** str z12, \[sp, #6, mul vl\]
+** str z13, \[sp, #7, mul vl\]
+** str z14, \[sp, #8, mul vl\]
+** str z15, \[sp, #9, mul vl\]
+** str z16, \[sp, #10, mul vl\]
+** str z17, \[sp, #11, mul vl\]
+** str z18, \[sp, #12, mul vl\]
+** str z19, \[sp, #13, mul vl\]
+** str z20, \[sp, #14, mul vl\]
+** str z21, \[sp, #15, mul vl\]
+** str z22, \[sp, #16, mul vl\]
+** str z23, \[sp, #17, mul vl\]
+** mrs x16, svcr
+** str x16, \[x29, #?16\]
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** smstop sm
+** bl ns_callee
+** ldr x16, \[x29, #?16\]
+** tbz x16, 0, .*
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** smstart sm
+** bl s_callee
+** ldr x16, \[x29, #?16\]
+** tbnz x16, 0, .*
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** bl sc_callee
+** ldr z8, \[sp, #2, mul vl\]
+** ldr z9, \[sp, #3, mul vl\]
+** ldr z10, \[sp, #4, mul vl\]
+** ldr z11, \[sp, #5, mul vl\]
+** ldr z12, \[sp, #6, mul vl\]
+** ldr z13, \[sp, #7, mul vl\]
+** ldr z14, \[sp, #8, mul vl\]
+** ldr z15, \[sp, #9, mul vl\]
+** ldr z16, \[sp, #10, mul vl\]
+** ldr z17, \[sp, #11, mul vl\]
+** ldr z18, \[sp, #12, mul vl\]
+** ldr z19, \[sp, #13, mul vl\]
+** ldr z20, \[sp, #14, mul vl\]
+** ldr z21, \[sp, #15, mul vl\]
+** ldr z22, \[sp, #16, mul vl\]
+** ldr z23, \[sp, #17, mul vl\]
+** ldr p4, \[sp\]
+** ldr p5, \[sp, #1, mul vl\]
+** ldr p6, \[sp, #2, mul vl\]
+** ldr p7, \[sp, #3, mul vl\]
+** ldr p8, \[sp, #4, mul vl\]
+** ldr p9, \[sp, #5, mul vl\]
+** ldr p10, \[sp, #6, mul vl\]
+** ldr p11, \[sp, #7, mul vl\]
+** ldr p12, \[sp, #8, mul vl\]
+** ldr p13, \[sp, #9, mul vl\]
+** ldr p14, \[sp, #10, mul vl\]
+** ldr p15, \[sp, #11, mul vl\]
+** addvl sp, sp, #18
+** ldp x29, x30, \[sp\], #?32
+** ret
+*/
+svbool_t __attribute__((arm_streaming_compatible))
+sc_caller ()
+{
+ ns_callee ();
+ s_callee ();
+ return sc_callee ();
+}
new file mode 100644
@@ -0,0 +1,45 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+
+#include <arm_sve.h>
+
+svbool_t ns_callee ();
+__attribute__((arm_streaming)) svbool_t s_callee ();
+__attribute__((arm_streaming_compatible)) svbool_t sc_callee ();
+
+struct callbacks {
+ svbool_t (*ns_ptr) ();
+ __attribute__((arm_streaming)) svbool_t (*s_ptr) ();
+ __attribute__((arm_streaming_compatible)) svbool_t (*sc_ptr) ();
+};
+
+svbool_t
+n_caller (struct callbacks *c)
+{
+ ns_callee ();
+ sc_callee ();
+
+ c->ns_ptr ();
+ return c->sc_ptr ();
+}
+
+svbool_t __attribute__((arm_streaming))
+s_caller (struct callbacks *c)
+{
+ s_callee ();
+ sc_callee ();
+
+ c->s_ptr ();
+ return c->sc_ptr ();
+}
+
+svbool_t __attribute__((arm_streaming_compatible))
+sc_caller (struct callbacks *c)
+{
+ sc_callee ();
+
+ return c->sc_ptr ();
+}
+
+// { dg-final { scan-assembler-not {[dpqz][0-9]+,} } }
+// { dg-final { scan-assembler-not {smstart\tsm} } }
+// { dg-final { scan-assembler-not {smstop\tsm} } }
new file mode 100644
@@ -0,0 +1,516 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+
+double produce_d0 ();
+void consume_d0 (double);
+
+/*
+** test_d0:
+** ...
+** smstop sm
+** bl produce_d0
+** fmov x10, d0
+** smstart sm
+** fmov d0, x10
+** fmov x10, d0
+** smstop sm
+** fmov d0, x10
+** bl consume_d0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_d0 ()
+{
+ double res = produce_d0 ();
+ asm volatile ("");
+ consume_d0 (res);
+}
+
+int8x8_t produce_d0_vec ();
+void consume_d0_vec (int8x8_t);
+
+/*
+** test_d0_vec:
+** ...
+** smstop sm
+** bl produce_d0_vec
+** (
+** fmov x10, d0
+** |
+** umov x10, v0.d\[0\]
+** )
+** smstart sm
+** fmov d0, x10
+** (
+** fmov x10, d0
+** |
+** umov x10, v0.d\[0\]
+** )
+** smstop sm
+** fmov d0, x10
+** bl consume_d0_vec
+** ...
+*/
+void __attribute__((arm_streaming))
+test_d0_vec ()
+{
+ int8x8_t res = produce_d0_vec ();
+ asm volatile ("");
+ consume_d0_vec (res);
+}
+
+int8x16_t produce_q0 ();
+void consume_q0 (int8x16_t);
+
+/*
+** test_q0:
+** ...
+** smstop sm
+** bl produce_q0
+** str q0, \[sp, #?-16\]!
+** smstart sm
+** ldr q0, \[sp\], #?16
+** str q0, \[sp, #?-16\]!
+** smstop sm
+** ldr q0, \[sp\], #?16
+** bl consume_q0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_q0 ()
+{
+ int8x16_t res = produce_q0 ();
+ asm volatile ("");
+ consume_q0 (res);
+}
+
+int8x16x2_t produce_q1 ();
+void consume_q1 (int8x16x2_t);
+
+/*
+** test_q1:
+** ...
+** smstop sm
+** bl produce_q1
+** stp q0, q1, \[sp, #?-32\]!
+** smstart sm
+** ldp q0, q1, \[sp\], #?32
+** stp q0, q1, \[sp, #?-32\]!
+** smstop sm
+** ldp q0, q1, \[sp\], #?32
+** bl consume_q1
+** ...
+*/
+void __attribute__((arm_streaming))
+test_q1 ()
+{
+ int8x16x2_t res = produce_q1 ();
+ asm volatile ("");
+ consume_q1 (res);
+}
+
+int8x16x3_t produce_q2 ();
+void consume_q2 (int8x16x3_t);
+
+/*
+** test_q2:
+** ...
+** smstop sm
+** bl produce_q2
+** stp q0, q1, \[sp, #?-48\]!
+** str q2, \[sp, #?32\]
+** smstart sm
+** ldr q2, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?48
+** stp q0, q1, \[sp, #?-48\]!
+** str q2, \[sp, #?32\]
+** smstop sm
+** ldr q2, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?48
+** bl consume_q2
+** ...
+*/
+void __attribute__((arm_streaming))
+test_q2 ()
+{
+ int8x16x3_t res = produce_q2 ();
+ asm volatile ("");
+ consume_q2 (res);
+}
+
+int8x16x4_t produce_q3 ();
+void consume_q3 (int8x16x4_t);
+
+/*
+** test_q3:
+** ...
+** smstop sm
+** bl produce_q3
+** stp q0, q1, \[sp, #?-64\]!
+** stp q2, q3, \[sp, #?32\]
+** smstart sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?64
+** stp q0, q1, \[sp, #?-64\]!
+** stp q2, q3, \[sp, #?32\]
+** smstop sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?64
+** bl consume_q3
+** ...
+*/
+void __attribute__((arm_streaming))
+test_q3 ()
+{
+ int8x16x4_t res = produce_q3 ();
+ asm volatile ("");
+ consume_q3 (res);
+}
+
+svint8_t produce_z0 ();
+void consume_z0 (svint8_t);
+
+/*
+** test_z0:
+** ...
+** smstop sm
+** bl produce_z0
+** addvl sp, sp, #-1
+** str z0, \[sp\]
+** smstart sm
+** ldr z0, \[sp\]
+** addvl sp, sp, #1
+** addvl sp, sp, #-1
+** str z0, \[sp\]
+** smstop sm
+** ldr z0, \[sp\]
+** addvl sp, sp, #1
+** bl consume_z0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z0 ()
+{
+ svint8_t res = produce_z0 ();
+ asm volatile ("");
+ consume_z0 (res);
+}
+
+svint8x4_t produce_z3 ();
+void consume_z3 (svint8x4_t);
+
+/*
+** test_z3:
+** ...
+** smstop sm
+** bl produce_z3
+** addvl sp, sp, #-4
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** smstart sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** addvl sp, sp, #4
+** addvl sp, sp, #-4
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** smstop sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** addvl sp, sp, #4
+** bl consume_z3
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z3 ()
+{
+ svint8x4_t res = produce_z3 ();
+ asm volatile ("");
+ consume_z3 (res);
+}
+
+svbool_t produce_p0 ();
+void consume_p0 (svbool_t);
+
+/*
+** test_p0:
+** ...
+** smstop sm
+** bl produce_p0
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** addvl sp, sp, #1
+** bl consume_p0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_p0 ()
+{
+ svbool_t res = produce_p0 ();
+ asm volatile ("");
+ consume_p0 (res);
+}
+
+void consume_d7 (double, double, double, double, double, double, double,
+ double);
+
+/*
+** test_d7:
+** ...
+** fmov x10, d0
+** fmov x11, d1
+** fmov x12, d2
+** fmov x13, d3
+** fmov x14, d4
+** fmov x15, d5
+** fmov x16, d6
+** fmov x17, d7
+** smstop sm
+** fmov d0, x10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d3, x13
+** fmov d4, x14
+** fmov d5, x15
+** fmov d6, x16
+** fmov d7, x17
+** bl consume_d7
+** ...
+*/
+void __attribute__((arm_streaming))
+test_d7 ()
+{
+ consume_d7 (1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
+}
+
+void consume_d7_vec (int8x8_t, int8x8_t, int8x8_t, int8x8_t, int8x8_t,
+ int8x8_t, int8x8_t, int8x8_t);
+
+/*
+** test_d7_vec:
+** ...
+** (
+** fmov x10, d0
+** fmov x11, d1
+** fmov x12, d2
+** fmov x13, d3
+** fmov x14, d4
+** fmov x15, d5
+** fmov x16, d6
+** fmov x17, d7
+** |
+** umov x10, v0.d\[0\]
+** umov x11, v1.d\[0\]
+** umov x12, v2.d\[0\]
+** umov x13, v3.d\[0\]
+** umov x14, v4.d\[0\]
+** umov x15, v5.d\[0\]
+** umov x16, v6.d\[0\]
+** umov x17, v7.d\[0\]
+** )
+** smstop sm
+** fmov d0, x10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d3, x13
+** fmov d4, x14
+** fmov d5, x15
+** fmov d6, x16
+** fmov d7, x17
+** bl consume_d7_vec
+** ...
+*/
+void __attribute__((arm_streaming))
+test_d7_vec (int8x8_t *ptr)
+{
+ consume_d7_vec (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr);
+}
+
+void consume_q7 (int8x16_t, int8x16_t, int8x16_t, int8x16_t, int8x16_t,
+ int8x16_t, int8x16_t, int8x16_t);
+
+/*
+** test_q7:
+** ...
+** stp q0, q1, \[sp, #?-128\]!
+** stp q2, q3, \[sp, #?32\]
+** stp q4, q5, \[sp, #?64\]
+** stp q6, q7, \[sp, #?96\]
+** smstop sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q4, q5, \[sp, #?64\]
+** ldp q6, q7, \[sp, #?96\]
+** ldp q0, q1, \[sp\], #?128
+** bl consume_q7
+** ...
+*/
+void __attribute__((arm_streaming))
+test_q7 (int8x16_t *ptr)
+{
+ consume_q7 (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr);
+}
+
+void consume_z7 (svint8_t, svint8_t, svint8_t, svint8_t, svint8_t,
+ svint8_t, svint8_t, svint8_t);
+
+/*
+** test_z7:
+** ...
+** addvl sp, sp, #-8
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** str z4, \[sp, #4, mul vl\]
+** str z5, \[sp, #5, mul vl\]
+** str z6, \[sp, #6, mul vl\]
+** str z7, \[sp, #7, mul vl\]
+** smstop sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** ldr z4, \[sp, #4, mul vl\]
+** ldr z5, \[sp, #5, mul vl\]
+** ldr z6, \[sp, #6, mul vl\]
+** ldr z7, \[sp, #7, mul vl\]
+** addvl sp, sp, #8
+** bl consume_z7
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z7 (svint8_t *ptr)
+{
+ consume_z7 (*ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr, *ptr);
+}
+
+void consume_p3 (svbool_t, svbool_t, svbool_t, svbool_t);
+
+/*
+** test_p3:
+** ...
+** addvl sp, sp, #-1
+** str p0, \[sp\]
+** str p1, \[sp, #1, mul vl\]
+** str p2, \[sp, #2, mul vl\]
+** str p3, \[sp, #3, mul vl\]
+** smstop sm
+** ldr p0, \[sp\]
+** ldr p1, \[sp, #1, mul vl\]
+** ldr p2, \[sp, #2, mul vl\]
+** ldr p3, \[sp, #3, mul vl\]
+** addvl sp, sp, #1
+** bl consume_p3
+** ...
+*/
+void __attribute__((arm_streaming))
+test_p3 (svbool_t *ptr)
+{
+ consume_p3 (*ptr, *ptr, *ptr, *ptr);
+}
+
+void consume_mixed (float, double, float32x4_t, svfloat32_t,
+ float, double, float64x2_t, svfloat64_t,
+ svbool_t, svbool_t, svbool_t, svbool_t);
+
+/*
+** test_mixed:
+** ...
+** addvl sp, sp, #-3
+** str p0, \[sp\]
+** str p1, \[sp, #1, mul vl\]
+** str p2, \[sp, #2, mul vl\]
+** str p3, \[sp, #3, mul vl\]
+** str z3, \[sp, #1, mul vl\]
+** str z7, \[sp, #2, mul vl\]
+** stp q2, q6, \[sp, #?-32\]!
+** fmov w10, s0
+** fmov x11, d1
+** fmov w12, s4
+** fmov x13, d5
+** smstop sm
+** fmov s0, w10
+** fmov d1, x11
+** fmov s4, w12
+** fmov d5, x13
+** ldp q2, q6, \[sp\], #?32
+** ldr p0, \[sp\]
+** ldr p1, \[sp, #1, mul vl\]
+** ldr p2, \[sp, #2, mul vl\]
+** ldr p3, \[sp, #3, mul vl\]
+** ldr z3, \[sp, #1, mul vl\]
+** ldr z7, \[sp, #2, mul vl\]
+** addvl sp, sp, #3
+** bl consume_mixed
+** ...
+*/
+void __attribute__((arm_streaming))
+test_mixed (float32x4_t *float32x4_ptr,
+ svfloat32_t *svfloat32_ptr,
+ float64x2_t *float64x2_ptr,
+ svfloat64_t *svfloat64_ptr,
+ svbool_t *svbool_ptr)
+{
+ consume_mixed (1.0f, 2.0, *float32x4_ptr, *svfloat32_ptr,
+ 3.0f, 4.0, *float64x2_ptr, *svfloat64_ptr,
+ *svbool_ptr, *svbool_ptr, *svbool_ptr, *svbool_ptr);
+}
+
+void consume_varargs (float, ...);
+
+/*
+** test_varargs:
+** ...
+** stp q3, q7, \[sp, #?-32\]!
+** fmov w10, s0
+** fmov x11, d1
+** (
+** fmov x12, d2
+** |
+** umov x12, v2.d\[0\]
+** )
+** fmov x13, d4
+** fmov x14, d5
+** (
+** fmov x15, d6
+** |
+** umov x15, v6.d\[0\]
+** )
+** smstop sm
+** fmov s0, w10
+** fmov d1, x11
+** fmov d2, x12
+** fmov d4, x13
+** fmov d5, x14
+** fmov d6, x15
+** ldp q3, q7, \[sp\], #?32
+** bl consume_varargs
+** ...
+*/
+void __attribute__((arm_streaming))
+test_varargs (float32x2_t *float32x2_ptr,
+ float32x4_t *float32x4_ptr,
+ float64x1_t *float64x1_ptr,
+ float64x2_t *float64x2_ptr)
+{
+ consume_varargs (1.0f, 2.0, *float32x2_ptr, *float32x4_ptr,
+ 3.0f, 4.0, *float64x1_ptr, *float64x2_ptr);
+}
new file mode 100644
@@ -0,0 +1,87 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -msve-vector-bits=128" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sve.h>
+
+svint8_t produce_z0 ();
+void consume_z0 (svint8_t);
+
+/*
+** test_z0:
+** ...
+** smstop sm
+** bl produce_z0
+** str q0, \[sp, #?-16\]!
+** smstart sm
+** ldr q0, \[sp\], #?16
+** str q0, \[sp, #?-16\]!
+** smstop sm
+** ldr q0, \[sp\], #?16
+** bl consume_z0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z0 ()
+{
+ svint8_t res = produce_z0 ();
+ asm volatile ("");
+ consume_z0 (res);
+}
+
+svint8x4_t produce_z3 ();
+void consume_z3 (svint8x4_t);
+
+/*
+** test_z3:
+** ...
+** smstop sm
+** bl produce_z3
+** stp q0, q1, \[sp, #?-64\]!
+** stp q2, q3, \[sp, #?32\]
+** smstart sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?64
+** stp q0, q1, \[sp, #?-64\]!
+** stp q2, q3, \[sp, #?32\]
+** smstop sm
+** ldp q2, q3, \[sp, #?32\]
+** ldp q0, q1, \[sp\], #?64
+** bl consume_z3
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z3 ()
+{
+ svint8x4_t res = produce_z3 ();
+ asm volatile ("");
+ consume_z3 (res);
+}
+
+svbool_t produce_p0 ();
+void consume_p0 (svbool_t);
+
+/*
+** test_p0:
+** ...
+** smstop sm
+** bl produce_p0
+** sub sp, sp, #?16
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** add sp, sp, #?16
+** sub sp, sp, #?16
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** add sp, sp, #?16
+** bl consume_p0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_p0 ()
+{
+ svbool_t res = produce_p0 ();
+ asm volatile ("");
+ consume_p0 (res);
+}
new file mode 100644
@@ -0,0 +1,103 @@
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls -msve-vector-bits=256" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sve.h>
+
+svint8_t produce_z0 ();
+void consume_z0 (svint8_t);
+
+/*
+** test_z0:
+** ...
+** smstop sm
+** bl produce_z0
+** sub sp, sp, #?32
+** str z0, \[sp\]
+** smstart sm
+** ldr z0, \[sp\]
+** add sp, sp, #?32
+** sub sp, sp, #?32
+** str z0, \[sp\]
+** smstop sm
+** ldr z0, \[sp\]
+** add sp, sp, #?32
+** bl consume_z0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z0 ()
+{
+ svint8_t res = produce_z0 ();
+ asm volatile ("");
+ consume_z0 (res);
+}
+
+svint8x4_t produce_z3 ();
+void consume_z3 (svint8x4_t);
+
+/*
+** test_z3:
+** ...
+** smstop sm
+** bl produce_z3
+** sub sp, sp, #?128
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** smstart sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** add sp, sp, #?128
+** sub sp, sp, #?128
+** str z0, \[sp\]
+** str z1, \[sp, #1, mul vl\]
+** str z2, \[sp, #2, mul vl\]
+** str z3, \[sp, #3, mul vl\]
+** smstop sm
+** ldr z0, \[sp\]
+** ldr z1, \[sp, #1, mul vl\]
+** ldr z2, \[sp, #2, mul vl\]
+** ldr z3, \[sp, #3, mul vl\]
+** add sp, sp, #?128
+** bl consume_z3
+** ...
+*/
+void __attribute__((arm_streaming))
+test_z3 ()
+{
+ svint8x4_t res = produce_z3 ();
+ asm volatile ("");
+ consume_z3 (res);
+}
+
+svbool_t produce_p0 ();
+void consume_p0 (svbool_t);
+
+/*
+** test_p0:
+** ...
+** smstop sm
+** bl produce_p0
+** sub sp, sp, #?32
+** str p0, \[sp\]
+** smstart sm
+** ldr p0, \[sp\]
+** add sp, sp, #?32
+** sub sp, sp, #?32
+** str p0, \[sp\]
+** smstop sm
+** ldr p0, \[sp\]
+** add sp, sp, #?32
+** bl consume_p0
+** ...
+*/
+void __attribute__((arm_streaming))
+test_p0 ()
+{
+ svbool_t res = produce_p0 ();
+ asm volatile ("");
+ consume_p0 (res);
+}
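
(Not part of the diff above: a minimal stand-alone sketch of the calling-convention behaviour these tests check, assuming the __attribute__((arm_streaming)) spelling used in this series.  The expected instruction sequence in the comment is taken from the s_caller pattern above; the callee name is hypothetical.)

#include <arm_sve.h>

/* Hypothetical non-streaming callee, analogous to ns_callee/produce_p0
   in the tests above.  */
svbool_t non_streaming_callee (void);

/* Streaming caller of a non-streaming function.  Per the s_caller
   pattern above, the call is expected to be wrapped as:

     smstop sm              -- leave streaming mode for the call
     bl non_streaming_callee
     addvl sp, sp, #-1      -- spill the svbool_t result around the switch
     str p0, [sp]
     smstart sm             -- re-enter streaming mode
     ldr p0, [sp]
     addvl sp, sp, #1  */
svbool_t __attribute__((arm_streaming))
streaming_caller (void)
{
  return non_streaming_callee ();
}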