Message ID | 1537456422.24844.12.camel@cavium.com |
---|---|
State | New |
Headers | show |
Series | [1/3,Aarch64] Implement Aarch64 SIMD ABI | expand |
Steve Ellcey <sellcey@cavium.com> writes: > @@ -1005,6 +1005,15 @@ static const struct processor *selected_tune; > /* The current tuning set. */ > struct tune_params aarch64_tune_params = generic_tunings; > > +/* Table of machine attributes. */ > +static const struct attribute_spec aarch64_attribute_table[] = > +{ > + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, > + affects_type_identity, handler, exclude } */ > + { "aarch64_vector_pcs", 0, 0, true, false, false, false, NULL, NULL }, > + { NULL, 0, 0, false, false, false, false, NULL, NULL } > +}; Maybe it would be better to make this a type attribute instead, so that it's possible to create pointers to PCS functions without losing the ABI information. > @@ -1383,6 +1392,31 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) > return false; > } > > +/* Return true if this is a definition of a vectorized simd function. */ > + > +static bool > +aarch64_simd_decl_p (tree fndecl) > +{ > + if (lookup_attribute ("aarch64_vector_pcs", DECL_ATTRIBUTES (fndecl)) != NULL) > + return true; > + if (lookup_attribute ("simd", DECL_ATTRIBUTES (fndecl)) == NULL) > + return false; > + return (VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (fndecl)))); Why's only the return type relevant here? Think this deserves a comment. > @@ -3181,7 +3215,9 @@ static bool > aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, > tree exp ATTRIBUTE_UNUSED) > { > - /* Currently, always true. */ > + if (aarch64_simd_decl_p (cfun->decl)) > + return false; This should be OK if the target is also a vector PCS function. > @@ -4012,7 +4061,8 @@ aarch64_layout_frame (void) > { > /* If there is an alignment gap between integer and fp callee-saves, > allocate the last fp register to it if possible. 
*/ > - if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0) > + if (regno == last_fp_reg && has_align_gap > + && !simd_function && (offset & 8) == 0) > { > cfun->machine->frame.reg_offset[regno] = max_int_offset; > break; Nit: one condition per line once the whole thing no longer fits on a line. > @@ -4024,7 +4074,7 @@ aarch64_layout_frame (void) > else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM > && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) > cfun->machine->frame.wb_candidate2 = regno; > - offset += UNITS_PER_WORD; > + offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD; > } > > offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); > @@ -4167,6 +4217,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, > return gen_storewb_pairdf_di (base, base, reg, reg2, > GEN_INT (-adjustment), > GEN_INT (UNITS_PER_WORD - adjustment)); > + case E_TFmode: > + return gen_storewb_pairtf_di (base, base, reg, reg2, > + GEN_INT (-adjustment), > + GEN_INT (UNITS_PER_VREG - adjustment)); > default: > gcc_unreachable (); > } > @@ -4179,7 +4233,7 @@ static void > aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) > { > rtx_insn *insn; > - machine_mode mode = (regno1 <= R30_REGNUM) ? 
E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > > if (regno2 == INVALID_REGNUM) > return aarch64_pushwb_single_reg (mode, regno1, adjustment); > @@ -4209,6 +4263,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, > case E_DFmode: > return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment), > GEN_INT (UNITS_PER_WORD)); > + case E_TFmode: > + return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment), > + GEN_INT (UNITS_PER_VREG)); > default: > gcc_unreachable (); > } > @@ -4222,7 +4279,7 @@ static void > aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, > rtx *cfi_ops) > { > - machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode; > + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); > rtx reg1 = gen_rtx_REG (mode, regno1); > > *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); > @@ -4257,6 +4314,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, > case E_DFmode: > return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2); > > + case E_TFmode: > + return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); > + > default: > gcc_unreachable (); > } > @@ -4277,6 +4337,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, > case E_DFmode: > return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2); > > + case E_TFmode: > + return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); > + > default: > gcc_unreachable (); > } > @@ -4309,6 +4372,10 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, > rtx_insn *insn; > unsigned regno; > unsigned regno2; > + HOST_WIDE_INT mode_size; > + > + if (!GET_MODE_SIZE (mode).is_constant(&mode_size)) > + gcc_unreachable (); Just make this poly_int64 and use known_eq instead of ==... 
> @@ -4334,7 +4401,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > > { ...here. That avoids having to justify why the size is known to be constant. > @@ -4375,6 +4442,9 @@ aarch64_restore_callee_saves (machine_mode mode, > unsigned regno; > unsigned regno2; > poly_int64 offset; > + HOST_WIDE_INT mode_size; > + > + gcc_assert (GET_MODE_SIZE (mode).is_constant(&mode_size)); > > for (regno = aarch64_next_callee_save (start, limit); > regno <= limit; > @@ -4398,7 +4468,7 @@ aarch64_restore_callee_saves (machine_mode mode, > > if (regno2 <= limit > && !cfun->machine->reg_is_wrapped_separately[regno2] > - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) > + && ((cfun->machine->frame.reg_offset[regno] + mode_size) > == cfun->machine->frame.reg_offset[regno2])) > { > rtx reg2 = gen_rtx_REG (mode, regno2); Same here. (gcc_assert (...) calls with necessary side effects don't work when the compiler is built with --enable-checking=no.) > @@ -4611,8 +4683,10 @@ aarch64_process_components (sbitmap components, bool prologue_p) > while (regno != last_regno) > { > /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved > - so DFmode for the vector registers is enough. */ > - machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode; > + so DFmode for the vector registers is enough. For simd functions > + we want to save the entire register. */ Would be good to fix the indentation of the second line while you're there. Maybe s/the entire register/the low 128 bits/ since it's possible to have Advanced SIMD vector PCS functions when compiling for SVE. 
> @@ -4712,6 +4787,25 @@ aarch64_set_handled_components (sbitmap components) > cfun->machine->reg_is_wrapped_separately[regno] = true; > } > > +/* Return 1 if the register is used by the epilogue. We need to say the > + return register is used, but only after epilogue generation is complete. > + Note that in the case of sibcalls, the values "used by the epilogue" are > + considered live at the start of the called function. > + > + For SIMD functions we need to return 1 for FP registers that are saved and > + restored by a function but not zero in call_used_regs. If we do not do > + this optimizations may remove the restore of the register. */ > + > +int > +aarch64_epilogue_uses (int regno) > +{ > + if (epilogue_completed && regno == LR_REGNUM) > + return 1; > + if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) > + return 1; > + return 0; Shouldn't this also depend on epilogue completed? We won't have saved the register otherwise, and it could cause the register to be marked live unnecessarily. If we check epilogue_completed, we can also check whether the function actually saves the register, which would make things even more precise. > @@ -4884,6 +4982,19 @@ aarch64_use_return_insn_p (void) > return known_eq (cfun->machine->frame.frame_size, 0); > } > > +/* Return false for non-leaf SIMD functions in order to avoid > + shrink-wrapping them. Doing this will lose the necessary > + save/restore of FP registers. */ > + > +bool > +aarch64_use_simple_return_insn_p (void) > +{ > + if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) > + return false; > + > + return true; > +} How hard would it be to avoid this? Shrink-wrapping could be a very useful optimisation for vector routines, if the routine has to use calls to other functions to handle rare but difficult cases. 
> @@ -6185,7 +6300,7 @@ aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) > void > aarch64_expand_call (rtx result, rtx mem, bool sibcall) > { > - rtx call, callee, tmp; > + rtx call, call_insn, callee, tmp; > rtvec vec; > machine_mode mode; > > @@ -6203,7 +6318,8 @@ aarch64_expand_call (rtx result, rtx mem, bool sibcall) > : !REG_P (callee)) > XEXP (mem, 0) = force_reg (mode, callee); > > - call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call_insn = gen_rtx_CALL (VOIDmode, mem, const0_rtx); > + call = call_insn; "call_insn" is a bit misleading, since it's not a full insn. Maybe: call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); pat = call; or something instead? LGTM otherwise, but I'll leave the AArch64 maintainers to do the main review. Thanks, Richard
> LGTM otherwise, but I'll leave the AArch64 maintainers to do the > main review. > > Thanks, > Richard Here is an updated version that addresses the issues you raised. The only thing I did not try to do is to change aarch64_use_simple_return_insn_p, so shrink wrapping of SIMD functions will still not happen. I think that may be a good thing, but I am not sure of everything involved in getting that to work so I skipped it for now while I work on the rest of the simd ABI changes like the ones in patches 2 and 3. I also added a new test to make sure that the code in aarch64_epilogue_uses was not saving extraneous registers. While that function returns true for all vector registers that a SIMD function must save, only registers that are used/clobbered in the function will be saved. That is now tested with the simd-abi-5.c test. Steve Ellcey sellcey@cavium.com 2018-10-10 Steve Ellcey <sellcey@cavium.com> * config/aarch64/aarch64-protos.h (aarch64_use_simple_return_insn_p): New prototype. (aarch64_epilogue_uses): Ditto. * config/aarch64/aarch64.c (aarch64_attribute_table): New array. (aarch64_simd_decl_p): New function. (aarch64_reg_save_mode): New function. (aarch64_is_simd_call_p): New function. (aarch64_function_ok_for_sibcall): Check for simd calls. (aarch64_layout_frame): Check for simd function. (aarch64_gen_storewb_pair): Handle E_TFmode. (aarch64_push_regs): Use aarch64_reg_save_mode to get mode. (aarch64_gen_loadwb_pair): Handle E_TFmode. (aarch64_pop_regs): Use aarch64_reg_save_mode to get mode. (aarch64_gen_store_pair): Handle E_TFmode. (aarch64_gen_load_pair): Ditto. (aarch64_save_callee_saves): Handle different mode sizes. (aarch64_restore_callee_saves): Ditto. (aarch64_components_for_bb): Check for simd function. (aarch64_epilogue_uses): New function. (aarch64_process_components): Check for simd function. (aarch64_expand_prologue): Ditto. (aarch64_expand_epilogue): Ditto. (aarch64_expand_call): Ditto. (aarch64_use_simple_return_insn_p): New function. 
(TARGET_ATTRIBUTE_TABLE): New define. * config/aarch64/aarch64.h (EPILOGUE_USES): Redefine. (FP_SIMD_SAVED_REGNUM_P): New macro. * config/aarch64/aarch64.md (simple_return): New define_expand. (load_pair_dw_tftf): New instruction. (store_pair_dw_tftf): Ditto. (loadwb_pair<TX:mode>_<P:mode>): Ditto. (storewb_pair<TX:mode>_<P:mode>): Ditto. 2018-10-10 Steve Ellcey <sellcey@cavium.com> * gcc.target/aarch64/torture/aarch64-torture.exp: New file. * gcc.target/aarch64/torture/simd-abi-1.c: New test. * gcc.target/aarch64/torture/simd-abi-2.c: Ditto. * gcc.target/aarch64/torture/simd-abi-3.c: Ditto. * gcc.target/aarch64/torture/simd-abi-4.c: Ditto. * gcc.target/aarch64/torture/simd-abi-5.c: Ditto. diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 5f18837..0f54adf 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -467,6 +467,7 @@ bool aarch64_split_dimode_const_store (rtx, rtx); bool aarch64_symbolic_address_p (rtx); bool aarch64_uimm12_shift (HOST_WIDE_INT); bool aarch64_use_return_insn_p (void); +bool aarch64_use_simple_return_insn_p (void); const char *aarch64_mangle_builtin_type (const_tree); const char *aarch64_output_casesi (rtx *); @@ -552,6 +553,8 @@ void aarch64_split_simd_move (rtx, rtx); /* Check for a legitimate floating point constant for FMOV. */ bool aarch64_float_const_representable_p (rtx); +extern int aarch64_epilogue_uses (int); + #if defined (RTX_CODE) void aarch64_gen_unlikely_cbranch (enum rtx_code, machine_mode cc_mode, rtx label_ref); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index d385b24..2d4e47b 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -1088,6 +1088,15 @@ static const struct processor *selected_tune; /* The current tuning set. */ struct tune_params aarch64_tune_params = generic_tunings; +/* Table of machine attributes. 
*/ +static const struct attribute_spec aarch64_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, + affects_type_identity, handler, exclude } */ + { "aarch64_vector_pcs", 0, 0, false, true, true, false, NULL, NULL }, + { NULL, 0, 0, false, false, false, false, NULL, NULL } +}; + #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0) /* An ISA extension in the co-processor and main instruction set space. */ @@ -1466,6 +1475,45 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) return false; } +/* Return true if this is a definition of a vectorized simd function. */ + +static bool +aarch64_simd_decl_p (tree fndecl) +{ + tree fntype; + + if (fndecl == NULL) + return false; + fntype = TREE_TYPE (fndecl); + if (fntype == NULL) + return false; + + /* All functions with the aarch64_vector_pcs attribute use the simd ABI. */ + if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL) + return true; + /* Functions without the aarch64_vector_pcs or simd attribute never use the + simd ABI. */ + if (lookup_attribute ("simd", TYPE_ATTRIBUTES (fntype)) == NULL) + return false; + /* Functions with the simd attribute can generate three versions of a + function, a masked vector function, an unmasked vector function, + and a scalar version. Only the vector versions use the simd ABI. */ + return (VECTOR_TYPE_P (TREE_TYPE (fntype))); +} + +/* Return the mode a register save/restore should use. DImode for integer + registers, DFmode for FP registers in non-SIMD functions (they only save + the bottom half of a 128 bit register), or TFmode for FP registers in + SIMD functions. */ + +static machine_mode +aarch64_reg_save_mode (tree fndecl, unsigned regno) +{ + return GP_REGNUM_P (regno) + ? E_DImode + : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode); +} + /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves the lower 64 bits of a 128-bit register. 
Tell the compiler the callee clobbers the top 64 bits when restoring the bottom 64 bits. */ @@ -3265,7 +3313,9 @@ static bool aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, tree exp ATTRIBUTE_UNUSED) { - /* Currently, always true. */ + if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl)) + return false; + return true; } @@ -4126,6 +4176,7 @@ aarch64_layout_frame (void) { HOST_WIDE_INT offset = 0; int regno, last_fp_reg = INVALID_REGNUM; + bool simd_function = aarch64_simd_decl_p (cfun->decl); cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain (); @@ -4139,6 +4190,17 @@ aarch64_layout_frame (void) cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; + /* If this is a non-leaf simd function with calls we assume that + at least one of those calls is to a non-simd function and thus + we must save V8 to V23 in the prologue. */ + + if (simd_function && !crtl->is_leaf) + { + for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) + if (FP_SIMD_SAVED_REGNUM_P (regno)) + df_set_regs_ever_live (regno, true); + } + /* First mark all the registers that really need to be saved... */ for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; @@ -4161,7 +4223,8 @@ aarch64_layout_frame (void) for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) if (df_regs_ever_live_p (regno) - && !call_used_regs[regno]) + && (!call_used_regs[regno] + || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))) { cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; last_fp_reg = regno; @@ -4203,7 +4266,10 @@ aarch64_layout_frame (void) { /* If there is an alignment gap between integer and fp callee-saves, allocate the last fp register to it if possible. 
*/ - if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0) + if (regno == last_fp_reg + && has_align_gap + && !simd_function + && (offset & 8) == 0) { cfun->machine->frame.reg_offset[regno] = max_int_offset; break; @@ -4215,7 +4281,7 @@ aarch64_layout_frame (void) else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) cfun->machine->frame.wb_candidate2 = regno; - offset += UNITS_PER_WORD; + offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD; } offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); @@ -4358,6 +4424,10 @@ aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, return gen_storewb_pairdf_di (base, base, reg, reg2, GEN_INT (-adjustment), GEN_INT (UNITS_PER_WORD - adjustment)); + case E_TFmode: + return gen_storewb_pairtf_di (base, base, reg, reg2, + GEN_INT (-adjustment), + GEN_INT (UNITS_PER_VREG - adjustment)); default: gcc_unreachable (); } @@ -4370,7 +4440,7 @@ static void aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) { rtx_insn *insn; - machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode; + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); if (regno2 == INVALID_REGNUM) return aarch64_pushwb_single_reg (mode, regno1, adjustment); @@ -4400,6 +4470,9 @@ aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, case E_DFmode: return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment), GEN_INT (UNITS_PER_WORD)); + case E_TFmode: + return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment), + GEN_INT (UNITS_PER_VREG)); default: gcc_unreachable (); } @@ -4413,7 +4486,7 @@ static void aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, rtx *cfi_ops) { - machine_mode mode = (regno1 <= R30_REGNUM) ? 
E_DImode : E_DFmode; + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); rtx reg1 = gen_rtx_REG (mode, regno1); *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); @@ -4448,6 +4521,9 @@ aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2, case E_DFmode: return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2); + case E_TFmode: + return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2); + default: gcc_unreachable (); } @@ -4468,6 +4544,9 @@ aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2, case E_DFmode: return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2); + case E_TFmode: + return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2); + default: gcc_unreachable (); } @@ -4507,6 +4586,7 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, { rtx reg, mem; poly_int64 offset; + int offset_diff; if (skip_wb && (regno == cfun->machine->frame.wb_candidate1 @@ -4522,12 +4602,12 @@ aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, offset)); regno2 = aarch64_next_callee_save (regno + 1, limit); + offset_diff = cfun->machine->frame.reg_offset[regno2] + - cfun->machine->frame.reg_offset[regno]; if (regno2 <= limit && !cfun->machine->reg_is_wrapped_separately[regno2] - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) - == cfun->machine->frame.reg_offset[regno2])) - + && known_eq (GET_MODE_SIZE (mode), offset_diff)) { rtx reg2 = gen_rtx_REG (mode, regno2); rtx mem2; @@ -4575,6 +4655,7 @@ aarch64_restore_callee_saves (machine_mode mode, continue; rtx reg, mem; + int offset_diff; if (skip_wb && (regno == cfun->machine->frame.wb_candidate1 @@ -4586,11 +4667,12 @@ aarch64_restore_callee_saves (machine_mode mode, mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); regno2 = aarch64_next_callee_save (regno + 1, limit); + offset_diff = cfun->machine->frame.reg_offset[regno2] + - cfun->machine->frame.reg_offset[regno]; if (regno2 <= limit && 
!cfun->machine->reg_is_wrapped_separately[regno2] - && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD) - == cfun->machine->frame.reg_offset[regno2])) + && known_eq (GET_MODE_SIZE (mode), offset_diff)) { rtx reg2 = gen_rtx_REG (mode, regno2); rtx mem2; @@ -4724,13 +4806,15 @@ aarch64_components_for_bb (basic_block bb) bitmap in = DF_LIVE_IN (bb); bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; + bool simd_function = aarch64_simd_decl_p (cfun->decl); sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); bitmap_clear (components); /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) - if ((!call_used_regs[regno]) + if ((!call_used_regs[regno] + || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))) && (bitmap_bit_p (in, regno) || bitmap_bit_p (gen, regno) || bitmap_bit_p (kill, regno))) @@ -4801,9 +4885,11 @@ aarch64_process_components (sbitmap components, bool prologue_p) while (regno != last_regno) { - /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved - so DFmode for the vector registers is enough. */ - machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode; + /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved + so DFmode for the vector registers is enough. For simd functions + we want to save the low 128 bits. */ + machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno); + rtx reg = gen_rtx_REG (mode, regno); poly_int64 offset = cfun->machine->frame.reg_offset[regno]; if (!frame_pointer_needed) @@ -4832,6 +4918,7 @@ aarch64_process_components (sbitmap components, bool prologue_p) mergeable with the current one into a pair. 
*/ if (!satisfies_constraint_Ump (mem) || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) + || (aarch64_simd_decl_p (cfun->decl) && (FP_REGNUM_P (regno))) || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), GET_MODE_SIZE (mode))) { @@ -5147,6 +5234,28 @@ aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, } } +/* Return 1 if the register is used by the epilogue. We need to say the + return register is used, but only after epilogue generation is complete. + Note that in the case of sibcalls, the values "used by the epilogue" are + considered live at the start of the called function. + + For SIMD functions we need to return 1 for FP registers that are saved and + restored by a function but are not zero in call_used_regs. If we do not do + this optimizations may remove the restore of the register. */ + +int +aarch64_epilogue_uses (int regno) +{ + if (epilogue_completed) + { + if (regno == LR_REGNUM) + return 1; + if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) + return 1; + } + return 0; +} + /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG is saved at BASE + OFFSET. */ @@ -5321,8 +5430,12 @@ aarch64_expand_prologue (void) aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, callee_adjust != 0 || emit_frame_chain); - aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0 || emit_frame_chain); + if (aarch64_simd_decl_p (cfun->decl)) + aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || emit_frame_chain); + else + aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0 || emit_frame_chain); /* We may need to probe the final adjustment if it is larger than the guard that is assumed by the called. 
*/ @@ -5348,6 +5461,19 @@ aarch64_use_return_insn_p (void) return known_eq (cfun->machine->frame.frame_size, 0); } +/* Return false for non-leaf SIMD functions in order to avoid + shrink-wrapping them. Doing this will lose the necessary + save/restore of FP registers. */ + +bool +aarch64_use_simple_return_insn_p (void) +{ + if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) + return false; + + return true; +} + /* Generate the epilogue instructions for returning from a function. This is almost exactly the reverse of the prolog sequence, except that we need to insert barriers to avoid scheduling loads that read @@ -5416,8 +5542,12 @@ aarch64_expand_epilogue (bool for_sibcall) aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, callee_adjust != 0, &cfi_ops); - aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, - callee_adjust != 0, &cfi_ops); + if (aarch64_simd_decl_p (cfun->decl)) + aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); + else + aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, + callee_adjust != 0, &cfi_ops); if (need_barrier_p) emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); @@ -18470,6 +18600,9 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_SPECULATION_SAFE_VALUE #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value +#undef TARGET_ATTRIBUTE_TABLE +#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index fa9af26..c58e057 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -406,13 +406,7 @@ extern unsigned aarch64_architecture_version; V_ALIASES(28), V_ALIASES(29), V_ALIASES(30), V_ALIASES(31) \ } -/* Say that the return address register is used by the 
epilogue, but only after - epilogue generation is complete. Note that in the case of sibcalls, the - values "used by the epilogue" are considered live at the start of the called - function. */ - -#define EPILOGUE_USES(REGNO) \ - (epilogue_completed && (REGNO) == LR_REGNUM) +#define EPILOGUE_USES(REGNO) (aarch64_epilogue_uses (REGNO)) /* EXIT_IGNORE_STACK should be nonzero if, when returning from a function, the stack pointer does not matter. This is only true if the function @@ -520,6 +514,8 @@ extern unsigned aarch64_architecture_version; #define PR_LO_REGNUM_P(REGNO)\ (((unsigned) (REGNO - P0_REGNUM)) <= (P7_REGNUM - P0_REGNUM)) +#define FP_SIMD_SAVED_REGNUM_P(REGNO) \ + (((unsigned) (REGNO - V8_REGNUM)) <= (V23_REGNUM - V8_REGNUM)) /* Register and constant classes. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index b4a4315..6c11ae0 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -723,7 +723,13 @@ "" ) -(define_insn "simple_return" +(define_expand "simple_return" + [(simple_return)] + "aarch64_use_simple_return_insn_p ()" + "" +) + +(define_insn "*simple_return" [(simple_return)] "" "ret" @@ -1385,6 +1391,21 @@ (set_attr "arch" "*,fp")] ) +(define_insn "load_pair_dw_tftf" + [(set (match_operand:TF 0 "register_operand" "=w") + (match_operand:TF 1 "aarch64_mem_pair_operand" "Ump")) + (set (match_operand:TF 2 "register_operand" "=w") + (match_operand:TF 3 "memory_operand" "m"))] + "TARGET_SIMD + && rtx_equal_p (XEXP (operands[3], 0), + plus_constant (Pmode, + XEXP (operands[1], 0), + GET_MODE_SIZE (TFmode)))" + "ldp\\t%q0, %q2, %1" + [(set_attr "type" "neon_ldp_q") + (set_attr "fp" "yes")] +) + ;; Operands 0 and 2 are tied together by the final condition; so we allow ;; fairly lax checking on the second memory operation. 
(define_insn "store_pair_sw_<SX:mode><SX2:mode>" @@ -1420,18 +1441,33 @@ (set_attr "arch" "*,fp")] ) +(define_insn "store_pair_dw_tftf" + [(set (match_operand:TF 0 "aarch64_mem_pair_operand" "=Ump") + (match_operand:TF 1 "register_operand" "w")) + (set (match_operand:TF 2 "memory_operand" "=m") + (match_operand:TF 3 "register_operand" "w"))] + "TARGET_SIMD && + rtx_equal_p (XEXP (operands[2], 0), + plus_constant (Pmode, + XEXP (operands[0], 0), + GET_MODE_SIZE (TFmode)))" + "stp\\t%q1, %q3, %0" + [(set_attr "type" "neon_stp_q") + (set_attr "fp" "yes")] +) + ;; Load pair with post-index writeback. This is primarily used in function ;; epilogues. (define_insn "loadwb_pair<GPI:mode>_<P:mode>" [(parallel [(set (match_operand:P 0 "register_operand" "=k") - (plus:P (match_operand:P 1 "register_operand" "0") - (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) + (plus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) (set (match_operand:GPI 2 "register_operand" "=r") - (mem:GPI (match_dup 1))) + (mem:GPI (match_dup 1))) (set (match_operand:GPI 3 "register_operand" "=r") - (mem:GPI (plus:P (match_dup 1) - (match_operand:P 5 "const_int_operand" "n"))))])] + (mem:GPI (plus:P (match_dup 1) + (match_operand:P 5 "const_int_operand" "n"))))])] "INTVAL (operands[5]) == GET_MODE_SIZE (<GPI:MODE>mode)" "ldp\\t%<w>2, %<w>3, [%1], %4" [(set_attr "type" "load_<ldpstp_sz>")] @@ -1452,6 +1488,21 @@ [(set_attr "type" "neon_load1_2reg")] ) +(define_insn "loadwb_pair<TX:mode>_<P:mode>" + [(parallel + [(set (match_operand:P 0 "register_operand" "=k") + (plus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) + (set (match_operand:TX 2 "register_operand" "=w") + (mem:TX (match_dup 1))) + (set (match_operand:TX 3 "register_operand" "=w") + (mem:TX (plus:P (match_dup 1) + (match_operand:P 5 "const_int_operand" "n"))))])] + "TARGET_SIMD && INTVAL (operands[5]) == GET_MODE_SIZE (<TX:MODE>mode)" + 
"ldp\\t%q2, %q3, [%1], %4" + [(set_attr "type" "neon_ldp_q")] +) + ;; Store pair with pre-index writeback. This is primarily used in function ;; prologues. (define_insn "storewb_pair<GPI:mode>_<P:mode>" @@ -1486,6 +1537,23 @@ [(set_attr "type" "neon_store1_2reg<q>")] ) +(define_insn "storewb_pair<TX:mode>_<P:mode>" + [(parallel + [(set (match_operand:P 0 "register_operand" "=&k") + (plus:P (match_operand:P 1 "register_operand" "0") + (match_operand:P 4 "aarch64_mem_pair_offset" "n"))) + (set (mem:TX (plus:P (match_dup 0) + (match_dup 4))) + (match_operand:TX 2 "register_operand" "w")) + (set (mem:TX (plus:P (match_dup 0) + (match_operand:P 5 "const_int_operand" "n"))) + (match_operand:TX 3 "register_operand" "w"))])] + "TARGET_SIMD && + INTVAL (operands[5]) == INTVAL (operands[4]) + GET_MODE_SIZE (<TX:MODE>mode)" + "stp\\t%q2, %q3, [%0, %4]!" + [(set_attr "type" "neon_stp_q")] +) + ;; ------------------------------------------------------------------- ;; Sign/Zero extension ;; ------------------------------------------------------------------- diff --git a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp index e69de29..22f08ff 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp +++ b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp @@ -0,0 +1,41 @@ +# Copyright (C) 2018 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +# GCC testsuite that uses the `gcc-dg.exp' driver, looping over +# optimization options. + +# Exit immediately if this isn't an AArch64 target. +if { ![istarget aarch64*-*-*] } then { + return +} + +# Load support procs. +load_lib gcc-dg.exp + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +# Main loop. +gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" $DEFAULT_CFLAGS + +# All done. +dg-finish diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c index e69de29..249554e 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ + +void __attribute__ ((aarch64_vector_pcs)) +f (void) +{ + /* Clobber all fp/simd regs and verify that the correct ones are saved + and restored in the prologue and epilogue of a SIMD function. 
*/ + __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3"); + __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7"); + __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11"); + __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15"); + __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19"); + __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23"); + __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27"); + __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31"); +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\td} } } */ +/* { dg-final { scan-assembler-not {\sldp\td} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c index e69de29..bf6e64a 
100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ + +void +f (void) +{ + /* Clobber all fp/simd regs and verify that the correct ones are saved + and restored in the prologue and epilogue of a non-SIMD function. */ + __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3"); + __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7"); + __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11"); + __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15"); + __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19"); + __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23"); + __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27"); + __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31"); +} + +/* { dg-final { scan-assembler {\sstp\td8, d9} } } */ +/* { dg-final { scan-assembler {\sstp\td10, d11} } } */ +/* { dg-final { scan-assembler {\sstp\td12, d13} } } */ +/* { dg-final { scan-assembler {\sstp\td14, d15} } } */ +/* { dg-final { scan-assembler {\sldp\td8, d9} } } */ +/* { dg-final { scan-assembler {\sldp\td10, d11} } } */ +/* { dg-final { scan-assembler {\sldp\td12, d13} } } */ +/* { dg-final { scan-assembler {\sldp\td14, d15} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[01234567]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq[01234567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq1[6789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq1[6789]} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c index e69de29..7d4f54f 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c @@ 
-0,0 +1,34 @@ +/* { dg-do compile } */ + +extern void g (void); + +void __attribute__ ((aarch64_vector_pcs)) +f (void) +{ + g(); +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\td} } } */ +/* { dg-final { scan-assembler-not {\sldp\td} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c index e69de29..e399690 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-additional-options "-std=c99" } */ + + + +/* There is nothing special about the calculations here, this is just + a test that can be compiled and run. 
*/ + +extern void abort (void); + +__Float64x2_t __attribute__ ((noinline, aarch64_vector_pcs)) +foo(__Float64x2_t a, __Float64x2_t b, __Float64x2_t c, + __Float64x2_t d, __Float64x2_t e, __Float64x2_t f, + __Float64x2_t g, __Float64x2_t h, __Float64x2_t i) +{ + __Float64x2_t w, x, y, z; + w = a + b * c; + x = d + e * f; + y = g + h * i; + return w + x * y; +} + + +int main() +{ + __Float64x2_t a, b, c, d; + a = (__Float64x2_t) { 1.0, 2.0 }; + b = (__Float64x2_t) { 3.0, 4.0 }; + c = (__Float64x2_t) { 5.0, 6.0 }; + d = foo (a, b, c, (a+b), (b+c), (a+c), (a-b), (b-c), (a-c)) + a + b + c; + if (d[0] != 337.0 || d[1] != 554.0) + abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-5.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-5.c index e69de29..7d639a5e 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-5.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-5.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ + +void __attribute__ ((aarch64_vector_pcs)) +f (void) +{ + /* Clobber some fp/simd regs and verify that only those are saved + and restored in the prologue and epilogue of a SIMD function. */ + __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11"); +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq1[23456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq1[23456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\td} } } */ +/* { dg-final { scan-assembler-not {\sldp\td} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp index e69de29..22f08ff 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp +++ b/gcc/testsuite/gcc.target/aarch64/torture/aarch64-torture.exp @@ -0,0 +1,41 @@ +# Copyright (C) 2018 Free Software Foundation, Inc. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GCC; see the file COPYING3. If not see +# <http://www.gnu.org/licenses/>. + +# GCC testsuite that uses the `gcc-dg.exp' driver, looping over +# optimization options. + +# Exit immediately if this isn't an AArch64 target. +if { ![istarget aarch64*-*-*] } then { + return +} + +# Load support procs. +load_lib gcc-dg.exp + +# If a testcase doesn't have special options, use these. +global DEFAULT_CFLAGS +if ![info exists DEFAULT_CFLAGS] then { + set DEFAULT_CFLAGS " -ansi -pedantic-errors" +} + +# Initialize `dg'. +dg-init + +# Main loop. +gcc-dg-runtest [lsort [glob -nocomplain $srcdir/$subdir/*.\[cS\]]] "" $DEFAULT_CFLAGS + +# All done. 
+dg-finish diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c index e69de29..249554e 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-1.c @@ -0,0 +1,41 @@ +/* { dg-do compile } */ + +void __attribute__ ((aarch64_vector_pcs)) +f (void) +{ + /* Clobber all fp/simd regs and verify that the correct ones are saved + and restored in the prologue and epilogue of a SIMD function. */ + __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3"); + __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7"); + __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11"); + __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15"); + __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19"); + __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23"); + __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27"); + __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31"); +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[034567]} } } */ +/* { dg-final { 
scan-assembler-not {\sldp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\td} } } */ +/* { dg-final { scan-assembler-not {\sldp\td} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c index e69de29..bf6e64a 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-2.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ + +void +f (void) +{ + /* Clobber all fp/simd regs and verify that the correct ones are saved + and restored in the prologue and epilogue of a non-SIMD function. */ + __asm__ __volatile__ ("" ::: "q0", "q1", "q2", "q3"); + __asm__ __volatile__ ("" ::: "q4", "q5", "q6", "q7"); + __asm__ __volatile__ ("" ::: "q8", "q9", "q10", "q11"); + __asm__ __volatile__ ("" ::: "q12", "q13", "q14", "q15"); + __asm__ __volatile__ ("" ::: "q16", "q17", "q18", "q19"); + __asm__ __volatile__ ("" ::: "q20", "q21", "q22", "q23"); + __asm__ __volatile__ ("" ::: "q24", "q25", "q26", "q27"); + __asm__ __volatile__ ("" ::: "q28", "q29", "q30", "q31"); +} + +/* { dg-final { scan-assembler {\sstp\td8, d9} } } */ +/* { dg-final { scan-assembler {\sstp\td10, d11} } } */ +/* { dg-final { scan-assembler {\sstp\td12, d13} } } */ +/* { dg-final { scan-assembler {\sstp\td14, d15} } } */ +/* { dg-final { scan-assembler {\sldp\td8, d9} } } */ +/* { dg-final { scan-assembler {\sldp\td10, d11} } } */ +/* { dg-final { scan-assembler {\sldp\td12, d13} } } */ +/* { dg-final { scan-assembler {\sldp\td14, d15} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[01234567]} } } */ +/* { dg-final { scan-assembler-not 
{\sldp\tq[01234567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq1[6789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq1[6789]} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final { scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c index e69de29..7d4f54f 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-3.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ + +extern void g (void); + +void __attribute__ ((aarch64_vector_pcs)) +f (void) +{ + g(); +} + +/* { dg-final { scan-assembler {\sstp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sstp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sstp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sstp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sstp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sstp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sstp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sstp\tq22, q23} } } */ +/* { dg-final { scan-assembler {\sldp\tq8, q9} } } */ +/* { dg-final { scan-assembler {\sldp\tq10, q11} } } */ +/* { dg-final { scan-assembler {\sldp\tq12, q13} } } */ +/* { dg-final { scan-assembler {\sldp\tq14, q15} } } */ +/* { dg-final { scan-assembler {\sldp\tq16, q17} } } */ +/* { dg-final { scan-assembler {\sldp\tq18, q19} } } */ +/* { dg-final { scan-assembler {\sldp\tq20, q21} } } */ +/* { dg-final { scan-assembler {\sldp\tq22, q23} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq[034567]} } } */ +/* { dg-final { scan-assembler-not {\sstp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sldp\tq2[456789]} } } */ +/* { dg-final { scan-assembler-not {\sstp\td} } } */ +/* { dg-final { scan-assembler-not {\sldp\td} } } */ +/* { dg-final { scan-assembler-not {\sstr\t} } } */ +/* { dg-final 
{ scan-assembler-not {\sldr\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c index e69de29..e399690 100644 --- a/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c +++ b/gcc/testsuite/gcc.target/aarch64/torture/simd-abi-4.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-additional-options "-std=c99" } */ + + + +/* There is nothing special about the calculations here, this is just + a test that can be compiled and run. */ + +extern void abort (void); + +__Float64x2_t __attribute__ ((noinline, aarch64_vector_pcs)) +foo(__Float64x2_t a, __Float64x2_t b, __Float64x2_t c, + __Float64x2_t d, __Float64x2_t e, __Float64x2_t f, + __Float64x2_t g, __Float64x2_t h, __Float64x2_t i) +{ + __Float64x2_t w, x, y, z; + w = a + b * c; + x = d + e * f; + y = g + h * i; + return w + x * y; +} + + +int main() +{ + __Float64x2_t a, b, c, d; + a = (__Float64x2_t) { 1.0, 2.0 }; + b = (__Float64x2_t) { 3.0, 4.0 }; + c = (__Float64x2_t) { 5.0, 6.0 }; + d = foo (a, b, c, (a+b), (b+c), (a+c), (a-b), (b-c), (a-c)) + a + b + c; + if (d[0] != 337.0 || d[1] != 554.0) + abort (); + return 0; +}