@@ -13917,6 +13917,16 @@ of available registers reserved for some other purposes is given
by this parameter. Default of the parameter
is the best found from numerous experiments.
+@item ira-consider-dup-in-all-alts
+Make IRA consider matching constraints (duplicated operand numbers)
+in all available alternatives for the preferred register class.
+If it is set to zero, IRA respects a matching constraint only when
+it appears in the sole available alternative with an appropriate
+register class.  Otherwise, IRA checks all available alternatives for
+the preferred register class even if it has already found an
+alternative with an appropriate register class, and respects any
+qualified matching constraint it finds.
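+As an illustrative sketch (not specific to any target), consider an
+input operand whose constraint string is @samp{"r,0"}: in the second
+alternative the digit @samp{0} is a matching constraint tying the
+operand to operand 0, while the first alternative imposes no such tie.
+With this parameter enabled, IRA still accounts for that tie when
+forming register preferences instead of ignoring it merely because an
+alternative without the tie exists.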
+
@item lra-inheritance-ebb-probability-cutoff
LRA tries to reuse values reloaded in registers in subsequent insns.
This optimization is called inheritance. EBB is used as a region to
@@ -233,19 +233,30 @@ go_through_subreg (rtx x, int *offset)
return reg;
}
+/* Return the recomputed frequency for this shuffle copy or its similar
+   case.  Since it's not for a real move insn, make it smaller.  */
+
+static int
+get_freq_for_shuffle_copy (int freq)
+{
+ return freq < 8 ? 1 : freq / 8;
+}
+
/* Process registers REG1 and REG2 in move INSN with execution
frequency FREQ. The function also processes the registers in a
potential move insn (INSN == NULL in this case) with frequency
FREQ. The function can modify hard register costs of the
corresponding allocnos or create a copy involving the corresponding
allocnos. The function does nothing if the both registers are hard
- registers. When nothing is changed, the function returns
- FALSE. */
+ registers. When nothing is changed, the function returns FALSE.
+ SINGLE_INPUT_OP_HAS_CSTR_P is only meaningful when CONSTRAINT_P
+ is true; see function ira_get_dup_out_num for its meaning. */
static bool
-process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p,
- rtx_insn *insn, int freq)
+process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p, rtx_insn *insn,
+ int freq, bool single_input_op_has_cstr_p = true)
{
- int allocno_preferenced_hard_regno, cost, index, offset1, offset2;
+ int allocno_preferenced_hard_regno, index, offset1, offset2;
+ int cost, conflict_cost, move_cost;
bool only_regs_p;
ira_allocno_t a;
reg_class_t rclass, aclass;
@@ -306,9 +317,52 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p,
return false;
ira_init_register_move_cost_if_necessary (mode);
if (HARD_REGISTER_P (reg1))
- cost = ira_register_move_cost[mode][aclass][rclass] * freq;
+ move_cost = ira_register_move_cost[mode][aclass][rclass];
+ else
+ move_cost = ira_register_move_cost[mode][rclass][aclass];
+
+ if (!single_input_op_has_cstr_p)
+ {
+ /* When this is a constraint copy and the matching constraint
+ applies not only to this given operand but also to some other
+ operand(s), saving the possible move cost does NOT require
+ reg1 and reg2 to use the same hardware register, so this
+ hardware preference isn't required to be fixed.  To avoid
+ over-preferring this hardware register and over-disparaging
+ it on conflicting objects, we need some cost tweaking here,
+ similar to what we do for a shuffle copy. */
+ gcc_assert (constraint_p);
+ int reduced_freq = get_freq_for_shuffle_copy (freq);
+ if (HARD_REGISTER_P (reg1))
+ /* For reg2 = opcode(reg1, reg3 ...), assume that reg3 is a
+ pseudo register which has a matching constraint on reg2.
+ Even if reg2 isn't assigned reg1, it's still possible to
+ avoid a register move if reg2 and reg3 use the same
+ hardware register.  So to keep the allocation from
+ over-preferring reg1, we can just treat this as a shuffle
+ copy. */
+ cost = conflict_cost = move_cost * reduced_freq;
+ else
+ {
+ /* For reg1 = opcode(reg2, reg3 ...), assume that reg3 is a
+ pseudo register which has a matching constraint on reg2.
+ To save the register move, it's better to assign reg1
+ to either of reg2 and reg3 (or one of the other pseudos
+ like reg3), so it's reasonable to use freq for the cost.
+ But for conflict_cost, since reg2 and reg3 conflict with
+ each other, both of them have the chance to be assigned
+ reg1.  Assuming reg3 has one copy which also conflicts
+ with reg2, we shouldn't make reg2 less preferred on reg1,
+ since reg3 has the same chance to be assigned reg1.
+ So adjust conflict_cost to make it the same as what we
+ use for a shuffle copy. */
+ cost = move_cost * freq;
+ conflict_cost = move_cost * reduced_freq;
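+ /* Illustrative numbers only: with FREQ == 1000 and a
+ register move cost of 4, this yields cost == 4000 but
+ conflict_cost == 4 * get_freq_for_shuffle_copy (1000)
+ == 4 * 125 == 500.  */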
+ }
+ }
else
- cost = ira_register_move_cost[mode][rclass][aclass] * freq;
+ cost = conflict_cost = move_cost * freq;
+
do
{
ira_allocate_and_set_costs
@@ -317,7 +371,7 @@ process_regs_for_copy (rtx reg1, rtx reg2, bool constraint_p,
ira_allocate_and_set_costs
(&ALLOCNO_CONFLICT_HARD_REG_COSTS (a), aclass, 0);
ALLOCNO_HARD_REG_COSTS (a)[index] -= cost;
- ALLOCNO_CONFLICT_HARD_REG_COSTS (a)[index] -= cost;
+ ALLOCNO_CONFLICT_HARD_REG_COSTS (a)[index] -= conflict_cost;
if (ALLOCNO_HARD_REG_COSTS (a)[index] < ALLOCNO_CLASS_COST (a))
ALLOCNO_CLASS_COST (a) = ALLOCNO_HARD_REG_COSTS (a)[index];
ira_add_allocno_pref (a, allocno_preferenced_hard_regno, freq);
@@ -420,7 +474,8 @@ add_insn_allocno_copies (rtx_insn *insn)
operand = recog_data.operand[i];
if (! REG_SUBREG_P (operand))
continue;
- if ((n = ira_get_dup_out_num (i, alts)) >= 0)
+ bool single_input_op_has_cstr_p;
+ if ((n = ira_get_dup_out_num (i, alts, single_input_op_has_cstr_p)) >= 0)
{
bound_p[n] = true;
dup = recog_data.operand[n];
@@ -429,8 +484,8 @@ add_insn_allocno_copies (rtx_insn *insn)
REG_P (operand)
? operand
: SUBREG_REG (operand)) != NULL_RTX)
- process_regs_for_copy (operand, dup, true, NULL,
- freq);
+ process_regs_for_copy (operand, dup, true, NULL, freq,
+ single_input_op_has_cstr_p);
}
}
for (i = 0; i < recog_data.n_operands; i++)
@@ -440,13 +495,15 @@ add_insn_allocno_copies (rtx_insn *insn)
&& find_reg_note (insn, REG_DEAD,
REG_P (operand)
? operand : SUBREG_REG (operand)) != NULL_RTX)
- /* If an operand dies, prefer its hard register for the output
- operands by decreasing the hard register cost or creating
- the corresponding allocno copies. The cost will not
- correspond to a real move insn cost, so make the frequency
- smaller. */
- process_reg_shuffles (insn, operand, i, freq < 8 ? 1 : freq / 8,
- bound_p);
+ {
+ /* If an operand dies, prefer its hard register for the output
+ operands by decreasing the hard register cost or creating
+ the corresponding allocno copies. The cost will not
+ correspond to a real move insn cost, so make the frequency
+ smaller. */
+ int new_freq = get_freq_for_shuffle_copy (freq);
+ process_reg_shuffles (insn, operand, i, new_freq, bound_p);
+ }
}
}
@@ -971,7 +971,7 @@ extern void ira_debug_disposition (void);
extern void ira_debug_allocno_classes (void);
extern void ira_init_register_move_cost (machine_mode);
extern alternative_mask ira_setup_alts (rtx_insn *);
-extern int ira_get_dup_out_num (int, alternative_mask);
+extern int ira_get_dup_out_num (int, alternative_mask, bool &);
/* ira-build.c */
@@ -1922,9 +1922,25 @@ ira_setup_alts (rtx_insn *insn)
/* Return the number of the output non-early clobber operand which
should be the same in any case as operand with number OP_NUM (or
negative value if there is no such operand). ALTS is the mask
- of alternatives that we should consider. */
+ of alternatives that we should consider.  SINGLE_INPUT_OP_HAS_CSTR_P
+ is set by this function; it indicates whether only a single input
+ operand has the matching constraint on the output operand at the
+ position given by the return value.  If the pattern allows any one
+ of several input operands to hold the matching constraint, it's set
+ to false; one typical case is a destructive FMA instruction on
+ target rs6000.  Note that for a non-NO_REGS preferred register
+ class with no free register move copy, if the parameter
+ PARAM_IRA_CONSIDER_DUP_IN_ALL_ALTS is set to one, this function
+ checks all available alternatives for matching constraints, even if
+ it has found or will find one alternative with a non-NO_REGS
+ regclass; this lets it respect more cases with matching
+ constraints.  If PARAM_IRA_CONSIDER_DUP_IN_ALL_ALTS is set to zero,
+ SINGLE_INPUT_OP_HAS_CSTR_P is always true, and the function stops
+ looking for a matching constraint relationship once it hits an
+ alternative with some non-NO_REGS regclass. */
int
-ira_get_dup_out_num (int op_num, alternative_mask alts)
+ira_get_dup_out_num (int op_num, alternative_mask alts,
+ bool &single_input_op_has_cstr_p)
{
int curr_alt, c, original;
bool ignore_p, use_commut_op_p;
@@ -1937,10 +1953,42 @@ ira_get_dup_out_num (int op_num, alternative_mask alts)
return -1;
str = recog_data.constraints[op_num];
use_commut_op_p = false;
+ single_input_op_has_cstr_p = true;
+
+ rtx op = recog_data.operand[op_num];
+ int op_regno = reg_or_subregno (op);
+ enum reg_class op_pref_cl = reg_preferred_class (op_regno);
+ machine_mode op_mode = GET_MODE (op);
+
+ ira_init_register_move_cost_if_necessary (op_mode);
+ /* If the preferred regclass isn't NO_REGS, continue to look for the
+ matching constraint in all available alternatives with the preferred
+ regclass, even if we have found or will find one alternative whose
+ constraint stands for a REG (non-NO_REGS) regclass.  Note that it
+ would be fine not to respect the matching constraint if the register
+ copy is free, so exclude that case. */
+ bool respect_dup_despite_reg_cstr
+ = param_ira_consider_dup_in_all_alts
+ && op_pref_cl != NO_REGS
+ && ira_register_move_cost[op_mode][op_pref_cl][op_pref_cl] > 0;
+
+ /* Record the alternatives whose constraints use the same regclass as
+ the preferred regclass.  Later, if we find one matching constraint
+ for this operand with the preferred regclass, we will visit these
+ recorded alternatives to check whether there is one alternative in
+ which no INPUT operand has a matching constraint on the same operand
+ as our candidate.  If yes, there is one alternative which is
+ perfectly fine without satisfying this matching constraint.  If no,
+ every such alternative has some other INPUT operand holding this
+ matching constraint, so it's fine to respect this matching
+ constraint and further create the constraint copy, since it becomes
+ harmless once some other operand takes the preference and this one
+ is interfered with. */
+ alternative_mask pref_cl_alts;
+
for (;;)
{
- rtx op = recog_data.operand[op_num];
-
+ pref_cl_alts = 0;
+
for (curr_alt = 0, ignore_p = !TEST_BIT (alts, curr_alt),
original = -1;;)
{
@@ -1963,9 +2011,25 @@ ira_get_dup_out_num (int op_num, alternative_mask alts)
{
enum constraint_num cn = lookup_constraint (str);
enum reg_class cl = reg_class_for_constraint (cn);
- if (cl != NO_REGS
- && !targetm.class_likely_spilled_p (cl))
- goto fail;
+ if (cl != NO_REGS && !targetm.class_likely_spilled_p (cl))
+ {
+ if (respect_dup_despite_reg_cstr)
+ {
+ /* If it's free to move from the preferred class to
+ the class without the matching constraint, we don't
+ have to respect this constraint with costs. */
+ if (cl != op_pref_cl
+ && (ira_reg_class_intersect[cl][op_pref_cl]
+ != NO_REGS)
+ && (ira_may_move_in_cost[op_mode][op_pref_cl][cl]
+ == 0))
+ goto fail;
+ else if (cl == op_pref_cl)
+ pref_cl_alts |= ALTERNATIVE_BIT (curr_alt);
+ }
+ else
+ goto fail;
+ }
if (constraint_satisfied_p (op, cn))
goto fail;
break;
@@ -1979,7 +2043,21 @@ ira_get_dup_out_num (int op_num, alternative_mask alts)
str = end;
if (original != -1 && original != n)
goto fail;
- original = n;
+ gcc_assert (n < recog_data.n_operands);
+ if (respect_dup_despite_reg_cstr)
+ {
+ const operand_alternative *op_alt
+ = &recog_op_alt[curr_alt * recog_data.n_operands];
+ /* Only respect the one with the preferred rclass;
+ without respect_dup_despite_reg_cstr it's possible
+ to pick one whose regclass isn't preferred first,
+ but that would fail since there should be other
+ alternatives with the preferred regclass. */
+ if (op_alt[n].cl == op_pref_cl)
+ original = n;
+ }
+ else
+ original = n;
continue;
}
}
@@ -1988,7 +2066,39 @@ ira_get_dup_out_num (int op_num, alternative_mask alts)
if (original == -1)
goto fail;
if (recog_data.operand_type[original] == OP_OUT)
- return original;
+ {
+ if (pref_cl_alts == 0)
+ return original;
+ /* Visit these recorded alternatives to check whether
+ there is one alternative in which no INPUT operand has
+ a matching constraint on the same operand as our
+ candidate.  Give up this candidate if so. */
+ int nop, nalt;
+ for (nalt = 0; nalt < recog_data.n_alternatives; nalt++)
+ {
+ if (!TEST_BIT (pref_cl_alts, nalt))
+ continue;
+ const operand_alternative *op_alt
+ = &recog_op_alt[nalt * recog_data.n_operands];
+ bool dup_in_other = false;
+ for (nop = 0; nop < recog_data.n_operands; nop++)
+ {
+ if (recog_data.operand_type[nop] != OP_IN)
+ continue;
+ if (nop == op_num)
+ continue;
+ if (op_alt[nop].matches == original)
+ {
+ dup_in_other = true;
+ break;
+ }
+ }
+ if (!dup_in_other)
+ return -1;
+ }
+ single_input_op_has_cstr_p = false;
+ return original;
+ }
fail:
if (use_commut_op_p)
break;
@@ -330,6 +330,10 @@ Max size of conflict table in MB.
Common Joined UInteger Var(param_ira_max_loops_num) Init(100) Param Optimization
Max loops number for regional RA.
+-param=ira-consider-dup-in-all-alts=
+Common Joined UInteger Var(param_ira_consider_dup_in_all_alts) Init(1) IntegerRange(0, 1) Param Optimization
+Make IRA consider matching constraints (duplicated operand numbers) in all available alternatives for the preferred register class.  If it is set to zero, IRA respects a matching constraint only when it appears in the sole available alternative with an appropriate register class.  Otherwise, IRA checks all available alternatives for the preferred register class even if it has already found an alternative with an appropriate register class, and respects any qualified matching constraint it finds.
+
-param=iv-always-prune-cand-set-bound=
Common Joined UInteger Var(param_iv_always_prune_cand_set_bound) Init(10) Param Optimization
If number of candidates in the set is smaller, we always try to remove unused ivs during its optimization.
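The testsuite hunks below drop xfails from aarch64 SVE tests that now match their expected assembly.  As a minimal, illustrative C sketch (not part of the patch) of the destructive-FMA situation mentioned in the ira_get_dup_out_num comment above, assuming a target whose FMA insn ties its output to one of several inputs and that FP contraction is enabled:

/* Illustrative only: each statement can be contracted into a fused
   multiply-add whose output operand must reuse one of its inputs, so the
   register allocator has to choose which input to tie to the accumulator.  */
double
fma_chain (double a, double b, double c, double d, double acc)
{
  acc = acc + a * b;
  acc = acc + c * d;
  return acc;
}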
@@ -218,7 +218,7 @@ TEST_UNIFORM_ZD (div_h4_f16_x_tied1, svfloat16_t, __fp16,
z0 = svdiv_x (p0, z0, d4))
/*
-** div_h4_f16_x_untied: { xfail *-*-* }
+** div_h4_f16_x_untied:
** mov z0\.h, h4
** fdivr z0\.h, p0/m, z0\.h, z1\.h
** ret
@@ -218,7 +218,7 @@ TEST_UNIFORM_ZD (div_s4_f32_x_tied1, svfloat32_t, float,
z0 = svdiv_x (p0, z0, d4))
/*
-** div_s4_f32_x_untied: { xfail *-*-* }
+** div_s4_f32_x_untied:
** mov z0\.s, s4
** fdivr z0\.s, p0/m, z0\.s, z1\.s
** ret
@@ -218,7 +218,7 @@ TEST_UNIFORM_ZD (div_d4_f64_x_tied1, svfloat64_t, double,
z0 = svdiv_x (p0, z0, d4))
/*
-** div_d4_f64_x_untied: { xfail *-*-* }
+** div_d4_f64_x_untied:
** mov z0\.d, d4
** fdivr z0\.d, p0/m, z0\.d, z1\.d
** ret
@@ -239,7 +239,7 @@ TEST_UNIFORM_ZD (divr_h4_f16_x_tied1, svfloat16_t, __fp16,
z0 = svdivr_x (p0, z0, d4))
/*
-** divr_h4_f16_x_untied: { xfail *-*-* }
+** divr_h4_f16_x_untied:
** mov z0\.h, h4
** fdiv z0\.h, p0/m, z0\.h, z1\.h
** ret
@@ -239,7 +239,7 @@ TEST_UNIFORM_ZD (divr_s4_f32_x_tied1, svfloat32_t, float,
z0 = svdivr_x (p0, z0, d4))
/*
-** divr_s4_f32_x_untied: { xfail *-*-* }
+** divr_s4_f32_x_untied:
** mov z0\.s, s4
** fdiv z0\.s, p0/m, z0\.s, z1\.s
** ret
@@ -239,7 +239,7 @@ TEST_UNIFORM_ZD (divr_d4_f64_x_tied1, svfloat64_t, double,
z0 = svdivr_x (p0, z0, d4))
/*
-** divr_d4_f64_x_untied: { xfail *-*-* }
+** divr_d4_f64_x_untied:
** mov z0\.d, d4
** fdiv z0\.d, p0/m, z0\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mad_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svmad_x (p0, z1, z0, d4))
/*
-** mad_h4_f16_x_untied: { xfail *-*-* }
+** mad_h4_f16_x_untied:
** mov z0\.h, h4
** fmla z0\.h, p0/m, z1\.h, z2\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mad_s4_f32_x_tied2, svfloat32_t, float,
z0 = svmad_x (p0, z1, z0, d4))
/*
-** mad_s4_f32_x_untied: { xfail *-*-* }
+** mad_s4_f32_x_untied:
** mov z0\.s, s4
** fmla z0\.s, p0/m, z1\.s, z2\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mad_d4_f64_x_tied2, svfloat64_t, double,
z0 = svmad_x (p0, z1, z0, d4))
/*
-** mad_d4_f64_x_untied: { xfail *-*-* }
+** mad_d4_f64_x_untied:
** mov z0\.d, d4
** fmla z0\.d, p0/m, z1\.d, z2\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mla_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svmla_x (p0, z1, z0, d4))
/*
-** mla_h4_f16_x_untied: { xfail *-*-* }
+** mla_h4_f16_x_untied:
** mov z0\.h, h4
** fmad z0\.h, p0/m, z2\.h, z1\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mla_s4_f32_x_tied2, svfloat32_t, float,
z0 = svmla_x (p0, z1, z0, d4))
/*
-** mla_s4_f32_x_untied: { xfail *-*-* }
+** mla_s4_f32_x_untied:
** mov z0\.s, s4
** fmad z0\.s, p0/m, z2\.s, z1\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mla_d4_f64_x_tied2, svfloat64_t, double,
z0 = svmla_x (p0, z1, z0, d4))
/*
-** mla_d4_f64_x_untied: { xfail *-*-* }
+** mla_d4_f64_x_untied:
** mov z0\.d, d4
** fmad z0\.d, p0/m, z2\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mls_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svmls_x (p0, z1, z0, d4))
/*
-** mls_h4_f16_x_untied: { xfail *-*-* }
+** mls_h4_f16_x_untied:
** mov z0\.h, h4
** fmsb z0\.h, p0/m, z2\.h, z1\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mls_s4_f32_x_tied2, svfloat32_t, float,
z0 = svmls_x (p0, z1, z0, d4))
/*
-** mls_s4_f32_x_untied: { xfail *-*-* }
+** mls_s4_f32_x_untied:
** mov z0\.s, s4
** fmsb z0\.s, p0/m, z2\.s, z1\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (mls_d4_f64_x_tied2, svfloat64_t, double,
z0 = svmls_x (p0, z1, z0, d4))
/*
-** mls_d4_f64_x_untied: { xfail *-*-* }
+** mls_d4_f64_x_untied:
** mov z0\.d, d4
** fmsb z0\.d, p0/m, z2\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (msb_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svmsb_x (p0, z1, z0, d4))
/*
-** msb_h4_f16_x_untied: { xfail *-*-* }
+** msb_h4_f16_x_untied:
** mov z0\.h, h4
** fmls z0\.h, p0/m, z1\.h, z2\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (msb_s4_f32_x_tied2, svfloat32_t, float,
z0 = svmsb_x (p0, z1, z0, d4))
/*
-** msb_s4_f32_x_untied: { xfail *-*-* }
+** msb_s4_f32_x_untied:
** mov z0\.s, s4
** fmls z0\.s, p0/m, z1\.s, z2\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (msb_d4_f64_x_tied2, svfloat64_t, double,
z0 = svmsb_x (p0, z1, z0, d4))
/*
-** msb_d4_f64_x_untied: { xfail *-*-* }
+** msb_d4_f64_x_untied:
** mov z0\.d, d4
** fmls z0\.d, p0/m, z1\.d, z2\.d
** ret
@@ -303,7 +303,7 @@ TEST_UNIFORM_ZD (mulx_h4_f16_x_tied1, svfloat16_t, __fp16,
z0 = svmulx_x (p0, z0, d4))
/*
-** mulx_h4_f16_x_untied: { xfail *-*-* }
+** mulx_h4_f16_x_untied:
** mov z0\.h, h4
** fmulx z0\.h, p0/m, z0\.h, z1\.h
** ret
@@ -303,7 +303,7 @@ TEST_UNIFORM_ZD (mulx_s4_f32_x_tied1, svfloat32_t, float,
z0 = svmulx_x (p0, z0, d4))
/*
-** mulx_s4_f32_x_untied: { xfail *-*-* }
+** mulx_s4_f32_x_untied:
** mov z0\.s, s4
** fmulx z0\.s, p0/m, z0\.s, z1\.s
** ret
@@ -303,7 +303,7 @@ TEST_UNIFORM_ZD (mulx_d4_f64_x_tied1, svfloat64_t, double,
z0 = svmulx_x (p0, z0, d4))
/*
-** mulx_d4_f64_x_untied: { xfail *-*-* }
+** mulx_d4_f64_x_untied:
** mov z0\.d, d4
** fmulx z0\.d, p0/m, z0\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmad_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svnmad_x (p0, z1, z0, d4))
/*
-** nmad_h4_f16_x_untied: { xfail *-*-* }
+** nmad_h4_f16_x_untied:
** mov z0\.h, h4
** fnmla z0\.h, p0/m, z1\.h, z2\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmad_s4_f32_x_tied2, svfloat32_t, float,
z0 = svnmad_x (p0, z1, z0, d4))
/*
-** nmad_s4_f32_x_untied: { xfail *-*-* }
+** nmad_s4_f32_x_untied:
** mov z0\.s, s4
** fnmla z0\.s, p0/m, z1\.s, z2\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmad_d4_f64_x_tied2, svfloat64_t, double,
z0 = svnmad_x (p0, z1, z0, d4))
/*
-** nmad_d4_f64_x_untied: { xfail *-*-* }
+** nmad_d4_f64_x_untied:
** mov z0\.d, d4
** fnmla z0\.d, p0/m, z1\.d, z2\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmla_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svnmla_x (p0, z1, z0, d4))
/*
-** nmla_h4_f16_x_untied: { xfail *-*-* }
+** nmla_h4_f16_x_untied:
** mov z0\.h, h4
** fnmad z0\.h, p0/m, z2\.h, z1\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmla_s4_f32_x_tied2, svfloat32_t, float,
z0 = svnmla_x (p0, z1, z0, d4))
/*
-** nmla_s4_f32_x_untied: { xfail *-*-* }
+** nmla_s4_f32_x_untied:
** mov z0\.s, s4
** fnmad z0\.s, p0/m, z2\.s, z1\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmla_d4_f64_x_tied2, svfloat64_t, double,
z0 = svnmla_x (p0, z1, z0, d4))
/*
-** nmla_d4_f64_x_untied: { xfail *-*-* }
+** nmla_d4_f64_x_untied:
** mov z0\.d, d4
** fnmad z0\.d, p0/m, z2\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmls_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svnmls_x (p0, z1, z0, d4))
/*
-** nmls_h4_f16_x_untied: { xfail *-*-* }
+** nmls_h4_f16_x_untied:
** mov z0\.h, h4
** fnmsb z0\.h, p0/m, z2\.h, z1\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmls_s4_f32_x_tied2, svfloat32_t, float,
z0 = svnmls_x (p0, z1, z0, d4))
/*
-** nmls_s4_f32_x_untied: { xfail *-*-* }
+** nmls_s4_f32_x_untied:
** mov z0\.s, s4
** fnmsb z0\.s, p0/m, z2\.s, z1\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmls_d4_f64_x_tied2, svfloat64_t, double,
z0 = svnmls_x (p0, z1, z0, d4))
/*
-** nmls_d4_f64_x_untied: { xfail *-*-* }
+** nmls_d4_f64_x_untied:
** mov z0\.d, d4
** fnmsb z0\.d, p0/m, z2\.d, z1\.d
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmsb_h4_f16_x_tied2, svfloat16_t, __fp16,
z0 = svnmsb_x (p0, z1, z0, d4))
/*
-** nmsb_h4_f16_x_untied: { xfail *-*-* }
+** nmsb_h4_f16_x_untied:
** mov z0\.h, h4
** fnmls z0\.h, p0/m, z1\.h, z2\.h
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmsb_s4_f32_x_tied2, svfloat32_t, float,
z0 = svnmsb_x (p0, z1, z0, d4))
/*
-** nmsb_s4_f32_x_untied: { xfail *-*-* }
+** nmsb_s4_f32_x_untied:
** mov z0\.s, s4
** fnmls z0\.s, p0/m, z1\.s, z2\.s
** ret
@@ -281,7 +281,7 @@ TEST_UNIFORM_ZD (nmsb_d4_f64_x_tied2, svfloat64_t, double,
z0 = svnmsb_x (p0, z1, z0, d4))
/*
-** nmsb_d4_f64_x_untied: { xfail *-*-* }
+** nmsb_d4_f64_x_untied:
** mov z0\.d, d4
** fnmls z0\.d, p0/m, z1\.d, z2\.d
** ret
@@ -336,7 +336,7 @@ TEST_UNIFORM_ZD (sub_h4_f16_x_tied1, svfloat16_t, __fp16,
z0 = svsub_x (p0, z0, d4))
/*
-** sub_h4_f16_x_untied: { xfail *-*-* }
+** sub_h4_f16_x_untied:
** mov z0\.h, h4
** fsubr z0\.h, p0/m, z0\.h, z1\.h
** ret
@@ -336,7 +336,7 @@ TEST_UNIFORM_ZD (sub_s4_f32_x_tied1, svfloat32_t, float,
z0 = svsub_x (p0, z0, d4))
/*
-** sub_s4_f32_x_untied: { xfail *-*-* }
+** sub_s4_f32_x_untied:
** mov z0\.s, s4
** fsubr z0\.s, p0/m, z0\.s, z1\.s
** ret
@@ -336,7 +336,7 @@ TEST_UNIFORM_ZD (sub_d4_f64_x_tied1, svfloat64_t, double,
z0 = svsub_x (p0, z0, d4))
/*
-** sub_d4_f64_x_untied: { xfail *-*-* }
+** sub_d4_f64_x_untied:
** mov z0\.d, d4
** fsubr z0\.d, p0/m, z0\.d, z1\.d
** ret
@@ -285,7 +285,7 @@ TEST_UNIFORM_ZD (subr_h4_f16_x_tied1, svfloat16_t, __fp16,
z0 = svsubr_x (p0, z0, d4))
/*
-** subr_h4_f16_x_untied: { xfail *-*-* }
+** subr_h4_f16_x_untied:
** mov z0\.h, h4
** fsub z0\.h, p0/m, z0\.h, z1\.h
** ret
@@ -285,7 +285,7 @@ TEST_UNIFORM_ZD (subr_s4_f32_x_tied1, svfloat32_t, float,
z0 = svsubr_x (p0, z0, d4))
/*
-** subr_s4_f32_x_untied: { xfail *-*-* }
+** subr_s4_f32_x_untied:
** mov z0\.s, s4
** fsub z0\.s, p0/m, z0\.s, z1\.s
** ret
@@ -285,7 +285,7 @@ TEST_UNIFORM_ZD (subr_d4_f64_x_tied1, svfloat64_t, double,
z0 = svsubr_x (p0, z0, d4))
/*
-** subr_d4_f64_x_untied: { xfail *-*-* }
+** subr_d4_f64_x_untied:
** mov z0\.d, d4
** fsub z0\.d, p0/m, z0\.d, z1\.d
** ret