@@ -798,6 +798,7 @@ bool aarch64_sve_mode_p (machine_mode);
HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
bool aarch64_sve_cnt_immediate_p (rtx);
bool aarch64_sve_scalar_inc_dec_immediate_p (rtx);
+bool aarch64_sve_rdvl_immediate_p (rtx);
bool aarch64_sve_addvl_addpl_immediate_p (rtx);
bool aarch64_sve_vector_inc_dec_immediate_p (rtx);
int aarch64_add_offset_temporaries (rtx);
@@ -810,6 +811,7 @@ char *aarch64_output_sve_prefetch (const char *, rtx, const char *);
char *aarch64_output_sve_cnt_immediate (const char *, const char *, rtx);
char *aarch64_output_sve_cnt_pat_immediate (const char *, const char *, rtx *);
char *aarch64_output_sve_scalar_inc_dec (rtx);
+char *aarch64_output_sve_rdvl (rtx);
char *aarch64_output_sve_addvl_addpl (rtx);
char *aarch64_output_sve_vector_inc_dec (const char *, rtx);
char *aarch64_output_scalar_simd_mov_immediate (rtx, scalar_int_mode);
@@ -2933,6 +2933,18 @@ aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
return -1;
}
+/* Return true if a single CNT[BHWD] instruction can multiply FACTOR
+ by the number of 128-bit quadwords in an SVE vector. */
+
+static bool
+aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
+{
+ /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
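+ /* For example, 24 = 8 * 3 can be loaded with "cnth x0, all, mul #3",
+ whereas 34 = 2 * 17 is rejected because the multiplier 17 is out of
+ range. */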
+ return (IN_RANGE (factor, 2, 16 * 16)
+ && (factor & 1) == 0
+ && factor <= 16 * (factor & -factor));
+}
+
/* Return true if we can move VALUE into a register using a single
CNT[BHWD] instruction. */
@@ -2940,11 +2952,7 @@ static bool
aarch64_sve_cnt_immediate_p (poly_int64 value)
{
HOST_WIDE_INT factor = value.coeffs[0];
- /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
- return (value.coeffs[1] == factor
- && IN_RANGE (factor, 2, 16 * 16)
- && (factor & 1) == 0
- && factor <= 16 * (factor & -factor));
+ return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
}
/* Likewise for rtx X. */
@@ -3060,6 +3068,50 @@ aarch64_output_sve_scalar_inc_dec (rtx offset)
-offset_value.coeffs[1], 0);
}
+/* Return true if a single RDVL instruction can multiply FACTOR by the
+ number of 128-bit quadwords in an SVE vector. */
+
+static bool
+aarch64_sve_rdvl_factor_p (HOST_WIDE_INT factor)
+{
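+ /* RDVL's immediate is in [-32, 31] and is implicitly scaled by 16,
+ so the factor must be a multiple of 16 in [-512, 496]. For example,
+ a factor of -272 = -17 * 16 corresponds to "rdvl x0, #-17". */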
+ return (multiple_p (factor, 16)
+ && IN_RANGE (factor, -32 * 16, 31 * 16));
+}
+
+/* Return true if we can move VALUE into a register using a single
+ RDVL instruction. */
+
+static bool
+aarch64_sve_rdvl_immediate_p (poly_int64 value)
+{
+ HOST_WIDE_INT factor = value.coeffs[0];
+ return value.coeffs[1] == factor && aarch64_sve_rdvl_factor_p (factor);
+}
+
+/* Likewise for rtx X. */
+
+bool
+aarch64_sve_rdvl_immediate_p (rtx x)
+{
+ poly_int64 value;
+ return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
+}
+
+/* Return the asm string for moving RDVL immediate OFFSET into register
+ operand 0. */
+
+char *
+aarch64_output_sve_rdvl (rtx offset)
+{
+ static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
+ poly_int64 offset_value = rtx_to_poly_int64 (offset);
+ gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
+
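+ /* The coefficient counts bytes per 128-bit quadword of vector length,
+ so dividing by 16 gives the number of whole vectors; e.g. an offset
+ of (272, 272) is printed as "rdvl\t%x0, #17". */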
+ int factor = offset_value.coeffs[1];
+ snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
+ return buffer;
+}
+
/* Return true if we can add VALUE to a register using a single ADDVL
or ADDPL instruction. */
@@ -3689,13 +3741,13 @@ aarch64_offset_temporaries (bool add_p, poly_int64 offset)
count += 1;
else if (factor != 0)
{
- factor = abs (factor);
- if (factor > 16 * (factor & -factor))
- /* Need one register for the CNT result and one for the multiplication
- factor. If necessary, the second temporary can be reused for the
- constant part of the offset. */
+ factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
+ if (!IN_RANGE (factor, -32, 31))
+ /* Need one register for the CNT or RDVL result and one for the
+ multiplication factor. If necessary, the second temporary
+ can be reused for the constant part of the offset. */
return 2;
- /* Need one register for the CNT result (which might then
+ /* Need one register for the CNT or RDVL result (which might then
be shifted). */
count += 1;
}
@@ -3784,85 +3836,100 @@ aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
/* Otherwise use a CNT-based sequence. */
else if (factor != 0)
{
- /* Use a subtraction if we have a negative factor. */
- rtx_code code = PLUS;
- if (factor < 0)
- {
- factor = -factor;
- code = MINUS;
- }
+ /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
+ with negative shifts indicating a shift right. */
+ HOST_WIDE_INT low_bit = least_bit_hwi (factor);
+ HOST_WIDE_INT rel_factor = factor / low_bit;
+ int shift = exact_log2 (low_bit) - 4;
+ gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
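+ /* For example, FACTOR == 68 (an offset of svcntw () * 17) gives
+ LOW_BIT == 4, REL_FACTOR == 17 and SHIFT == -2, so the offset can
+ be computed as "rdvl x, #17" followed by "asr x, x, #2". */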
+
+ /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
+ equal to CNTB * FACTOR / 16, with CODE being the [+-].
- /* Calculate CNTD * FACTOR / 2. First try to fold the division
- into the multiplication. */
+ We can avoid a multiplication if REL_FACTOR is in the range
+ of RDVL, although there are then various optimizations that
+ we can try on top. */
+ rtx_code code = PLUS;
rtx val;
- int shift = 0;
- if (factor & 1)
- /* Use a right shift by 1. */
- shift = -1;
- else
- factor /= 2;
- HOST_WIDE_INT low_bit = factor & -factor;
- if (factor <= 16 * low_bit)
+ if (IN_RANGE (rel_factor, -32, 31))
{
- if (factor > 16 * 8)
+ /* Try to use an unshifted CNT[BHWD] or RDVL. */
+ if (aarch64_sve_cnt_factor_p (factor)
+ || aarch64_sve_rdvl_factor_p (factor))
+ {
+ val = gen_int_mode (poly_int64 (factor, factor), mode);
+ shift = 0;
+ }
+ /* Try to subtract an unshifted CNT[BHWD]. */
+ else if (aarch64_sve_cnt_factor_p (-factor))
{
- /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
- the value with the minimum multiplier and shift it into
- position. */
- int extra_shift = exact_log2 (low_bit);
- shift += extra_shift;
- factor >>= extra_shift;
+ code = MINUS;
+ val = gen_int_mode (poly_int64 (-factor, -factor), mode);
+ shift = 0;
}
- val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
+ /* If subtraction is free, prefer to load a positive constant.
+ In the best case this will fit a shifted CNTB. */
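+ /* For example, adding -svcntw () * 17 to a register loads 272 with
+ "rdvl x, #17", shifts it right by 2 and subtracts it, rather than
+ multiplying by a negative constant. */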
+ else if (src != const0_rtx && rel_factor < 0)
+ {
+ code = MINUS;
+ val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
+ }
+ /* Otherwise use a shifted RDVL or CNT[BHWD]. */
+ else
+ val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
}
else
{
- /* Base the factor on LOW_BIT if we can calculate LOW_BIT
- directly, since that should increase the chances of being
- able to use a shift and add sequence. If LOW_BIT itself
- is out of range, just use CNTD. */
- if (low_bit <= 16 * 8)
- factor /= low_bit;
+ /* If we can calculate CNTB << SHIFT directly, prefer to do that,
+ since it should increase the chances of being able to use
+ a shift and add sequence for the multiplication.
+ If CNTB << SHIFT is out of range, stick with the current
+ shift factor. */
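+ /* For example, an offset of svcntb () * 33 has FACTOR == 528,
+ LOW_BIT == 16 and REL_FACTOR == 33: we load CNTB and multiply it
+ by 33, typically as a shift-and-add. */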
+ if (IN_RANGE (low_bit, 2, 16 * 16))
+ {
+ val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
+ shift = 0;
+ }
else
- low_bit = 1;
+ val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
- val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
val = aarch64_force_temporary (mode, temp1, val);
+ /* Prefer to multiply by a positive factor and subtract rather
+ than multiply by a negative factor and add, since positive
+ values are usually easier to move. */
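+ /* For example, adding -svcntb () * 33 to a register multiplies CNTB
+ by 33 and subtracts the result, rather than multiplying by -33. */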
+ if (rel_factor < 0 && src != const0_rtx)
+ {
+ rel_factor = -rel_factor;
+ code = MINUS;
+ }
+
if (can_create_pseudo_p ())
{
- rtx coeff1 = gen_int_mode (factor, mode);
+ rtx coeff1 = gen_int_mode (rel_factor, mode);
val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
}
else
{
- /* Go back to using a negative multiplication factor if we have
- no register from which to subtract. */
- if (code == MINUS && src == const0_rtx)
- {
- factor = -factor;
- code = PLUS;
- }
- rtx coeff1 = gen_int_mode (factor, mode);
+ rtx coeff1 = gen_int_mode (rel_factor, mode);
coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
val = gen_rtx_MULT (mode, val, coeff1);
}
}
+ /* Multiply by 2 ** SHIFT. */
if (shift > 0)
{
- /* Multiply by 1 << SHIFT. */
val = aarch64_force_temporary (mode, temp1, val);
val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
}
- else if (shift == -1)
+ else if (shift < 0)
{
- /* Divide by 2. */
val = aarch64_force_temporary (mode, temp1, val);
- val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
+ val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
}
- /* Calculate SRC +/- CNTD * FACTOR / 2. */
+ /* Add the result to SRC or subtract the result from SRC. */
if (src != const0_rtx)
{
val = aarch64_force_temporary (mode, temp1, val);
@@ -4508,7 +4575,9 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
aarch64_report_sve_required ();
return;
}
- if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
+ if (base == const0_rtx
+ && (aarch64_sve_cnt_immediate_p (offset)
+ || aarch64_sve_rdvl_immediate_p (offset)))
emit_insn (gen_rtx_SET (dest, imm));
else
{
@@ -19641,7 +19710,9 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
return true;
- if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
+ if (TARGET_SVE
+ && (aarch64_sve_cnt_immediate_p (x)
+ || aarch64_sve_rdvl_immediate_p (x)))
return true;
return aarch64_classify_symbolic_expression (x)
@@ -1230,6 +1230,7 @@ (define_insn "*mov<mode>_aarch64"
[w, D<hq>; neon_move , simd ] << aarch64_output_scalar_simd_mov_immediate (operands[1], <MODE>mode);
/* The "mov_imm" type for CNT is just a placeholder. */
[r, Usv ; mov_imm , sve ] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r, Usr ; mov_imm , sve ] << aarch64_output_sve_rdvl (operands[1]);
[r, m ; load_4 , * ] ldr<size>\t%w0, %1
[w, m ; load_4 , * ] ldr\t%<size>0, %1
[m, r Z ; store_4 , * ] str<size>\\t%w1, %0
@@ -1289,6 +1290,7 @@ (define_insn_and_split "*movsi_aarch64"
[r , n ; mov_imm , * ,16] #
/* The "mov_imm" type for CNT is just a placeholder. */
[r , Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r , Usr; mov_imm , sve , 4] << aarch64_output_sve_rdvl (operands[1]);
[r , m ; load_4 , * , 4] ldr\t%w0, %1
[w , m ; load_4 , fp , 4] ldr\t%s0, %1
[m , r Z; store_4 , * , 4] str\t%w1, %0
@@ -1324,6 +1326,7 @@ (define_insn_and_split "*movdi_aarch64"
[r, n ; mov_imm , * ,16] #
/* The "mov_imm" type for CNT is just a placeholder. */
[r, Usv; mov_imm , sve , 4] << aarch64_output_sve_cnt_immediate ("cnt", "%x0", operands[1]);
+ [r, Usr; mov_imm , sve , 4] << aarch64_output_sve_rdvl (operands[1]);
[r, m ; load_8 , * , 4] ldr\t%x0, %1
[w, m ; load_8 , fp , 4] ldr\t%d0, %1
[m, r Z; store_8 , * , 4] str\t%x1, %0
@@ -219,6 +219,12 @@ (define_constraint "Ulc"
(and (match_code "const_int")
(match_test "aarch64_high_bits_all_ones_p (ival)")))
+(define_constraint "Usr"
+ "@internal
+ A constraint that matches a value produced by RDVL."
+ (and (match_code "const_poly_int")
+ (match_test "aarch64_sve_rdvl_immediate_p (op)")))
+
(define_constraint "Usv"
"@internal
A constraint that matches a VG-based constant that can be loaded by
@@ -51,19 +51,24 @@ PROTO (cntb_15, uint64_t, ()) { return svcntb () * 15; }
*/
PROTO (cntb_16, uint64_t, ()) { return svcntb () * 16; }
-/* Other sequences would be OK. */
/*
** cntb_17:
-** cntb x0, all, mul #16
-** incb x0
+** rdvl x0, #17
** ret
*/
PROTO (cntb_17, uint64_t, ()) { return svcntb () * 17; }
+/*
+** cntb_31:
+** rdvl x0, #31
+** ret
+*/
+PROTO (cntb_31, uint64_t, ()) { return svcntb () * 31; }
+
/*
** cntb_32:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cntb_32, uint64_t, ()) { return svcntb () * 32; }
@@ -80,16 +85,16 @@ PROTO (cntb_33, uint64_t, ()) { return svcntb () * 33; }
/*
** cntb_64:
-** cntd (x[0-9]+)
-** lsl x0, \1, 9
+** cntb (x[0-9]+)
+** lsl x0, \1, 6
** ret
*/
PROTO (cntb_64, uint64_t, ()) { return svcntb () * 64; }
/*
** cntb_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 10
+** cntb (x[0-9]+)
+** lsl x0, \1, 7
** ret
*/
PROTO (cntb_128, uint64_t, ()) { return svcntb () * 128; }
@@ -106,46 +111,70 @@ PROTO (cntb_129, uint64_t, ()) { return svcntb () * 129; }
/*
** cntb_m1:
-** cntb (x[0-9]+)
-** neg x0, \1
+** rdvl x0, #-1
** ret
*/
PROTO (cntb_m1, uint64_t, ()) { return -svcntb (); }
/*
** cntb_m13:
-** cntb (x[0-9]+), all, mul #13
-** neg x0, \1
+** rdvl x0, #-13
** ret
*/
PROTO (cntb_m13, uint64_t, ()) { return -svcntb () * 13; }
/*
** cntb_m15:
-** cntb (x[0-9]+), all, mul #15
-** neg x0, \1
+** rdvl x0, #-15
** ret
*/
PROTO (cntb_m15, uint64_t, ()) { return -svcntb () * 15; }
/*
** cntb_m16:
-** cntb (x[0-9]+), all, mul #16
-** neg x0, \1
+** rdvl x0, #-16
** ret
*/
PROTO (cntb_m16, uint64_t, ()) { return -svcntb () * 16; }
-/* Other sequences would be OK. */
/*
** cntb_m17:
-** cntb x0, all, mul #16
-** incb x0
-** neg x0, x0
+** rdvl x0, #-17
** ret
*/
PROTO (cntb_m17, uint64_t, ()) { return -svcntb () * 17; }
+/*
+** cntb_m32:
+** rdvl x0, #-32
+** ret
+*/
+PROTO (cntb_m32, uint64_t, ()) { return -svcntb () * 32; }
+
+/*
+** cntb_m33:
+** rdvl x0, #-32
+** decb x0
+** ret
+*/
+PROTO (cntb_m33, uint64_t, ()) { return -svcntb () * 33; }
+
+/*
+** cntb_m34:
+** rdvl (x[0-9]+), #-17
+** lsl x0, \1, #?1
+** ret
+*/
+PROTO (cntb_m34, uint64_t, ()) { return -svcntb () * 34; }
+
+/*
+** cntb_m64:
+** rdvl (x[0-9]+), #-1
+** lsl x0, \1, #?6
+** ret
+*/
+PROTO (cntb_m64, uint64_t, ()) { return -svcntb () * 64; }
+
/*
** incb_1:
** incb x0
@@ -54,8 +54,8 @@ PROTO (cntd_16, uint64_t, ()) { return svcntd () * 16; }
/* Other sequences would be OK. */
/*
** cntd_17:
-** cntb x0, all, mul #2
-** incd x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 3
** ret
*/
PROTO (cntd_17, uint64_t, ()) { return svcntd () * 17; }
@@ -107,8 +107,7 @@ PROTO (cntd_m15, uint64_t, ()) { return -svcntd () * 15; }
/*
** cntd_m16:
-** cntb (x[0-9]+), all, mul #2
-** neg x0, \1
+** rdvl x0, #-2
** ret
*/
PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
@@ -116,9 +115,8 @@ PROTO (cntd_m16, uint64_t, ()) { return -svcntd () * 16; }
/* Other sequences would be OK. */
/*
** cntd_m17:
-** cntb x0, all, mul #2
-** incd x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 3
** ret
*/
PROTO (cntd_m17, uint64_t, ()) { return -svcntd () * 17; }
@@ -54,8 +54,8 @@ PROTO (cnth_16, uint64_t, ()) { return svcnth () * 16; }
/* Other sequences would be OK. */
/*
** cnth_17:
-** cntb x0, all, mul #8
-** inch x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 1
** ret
*/
PROTO (cnth_17, uint64_t, ()) { return svcnth () * 17; }
@@ -69,16 +69,16 @@ PROTO (cnth_32, uint64_t, ()) { return svcnth () * 32; }
/*
** cnth_64:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cnth_64, uint64_t, ()) { return svcnth () * 64; }
/*
** cnth_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 9
+** cntb (x[0-9]+)
+** lsl x0, \1, 6
** ret
*/
PROTO (cnth_128, uint64_t, ()) { return svcnth () * 128; }
@@ -109,8 +109,7 @@ PROTO (cnth_m15, uint64_t, ()) { return -svcnth () * 15; }
/*
** cnth_m16:
-** cntb (x[0-9]+), all, mul #8
-** neg x0, \1
+** rdvl x0, #-8
** ret
*/
PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
@@ -118,9 +117,8 @@ PROTO (cnth_m16, uint64_t, ()) { return -svcnth () * 16; }
/* Other sequences would be OK. */
/*
** cnth_m17:
-** cntb x0, all, mul #8
-** inch x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 1
** ret
*/
PROTO (cnth_m17, uint64_t, ()) { return -svcnth () * 17; }
@@ -54,8 +54,8 @@ PROTO (cntw_16, uint64_t, ()) { return svcntw () * 16; }
/* Other sequences would be OK. */
/*
** cntw_17:
-** cntb x0, all, mul #4
-** incw x0
+** rdvl (x[0-9]+), #17
+** asr x0, \1, 2
** ret
*/
PROTO (cntw_17, uint64_t, ()) { return svcntw () * 17; }
@@ -76,8 +76,8 @@ PROTO (cntw_64, uint64_t, ()) { return svcntw () * 64; }
/*
** cntw_128:
-** cntd (x[0-9]+)
-** lsl x0, \1, 8
+** cntb (x[0-9]+)
+** lsl x0, \1, 5
** ret
*/
PROTO (cntw_128, uint64_t, ()) { return svcntw () * 128; }
@@ -108,8 +108,7 @@ PROTO (cntw_m15, uint64_t, ()) { return -svcntw () * 15; }
/*
** cntw_m16:
-** cntb (x[0-9]+), all, mul #4
-** neg x0, \1
+** rdvl x0, #-4
** ret
*/
PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
@@ -117,9 +116,8 @@ PROTO (cntw_m16, uint64_t, ()) { return -svcntw () * 16; }
/* Other sequences would be OK. */
/*
** cntw_m17:
-** cntb x0, all, mul #4
-** incw x0
-** neg x0, x0
+** rdvl (x[0-9]+), #-17
+** asr x0, \1, 2
** ret
*/
PROTO (cntw_m17, uint64_t, ()) { return -svcntw () * 17; }
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfb_vnum_31, uint16_t,
/*
** prfb_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfb pldl1keep, p0, \[\3\]
** ret
@@ -240,7 +240,7 @@ TEST_PREFETCH (prfb_vnum_m32, uint16_t,
/*
** prfb_vnum_m33:
** ...
-** prfb pldl1keep, p0, \[x[0-9]+\]
+** prfb pldl1keep, p0, \[x[0-9]+(, x[0-9]+)?\]
** ret
*/
TEST_PREFETCH (prfb_vnum_m33, uint16_t,
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfd_vnum_31, uint16_t,
/*
** prfd_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfd pldl1keep, p0, \[\3\]
** ret
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfh_vnum_31, uint16_t,
/*
** prfh_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfh pldl1keep, p0, \[\3\]
** ret
@@ -218,8 +218,8 @@ TEST_PREFETCH (prfw_vnum_31, uint16_t,
/*
** prfw_vnum_32:
-** cntd (x[0-9]+)
-** lsl (x[0-9]+), \1, #?8
+** cntb (x[0-9]+)
+** lsl (x[0-9]+), \1, #?5
** add (x[0-9]+), (\2, x0|x0, \2)
** prfw pldl1keep, p0, \[\3\]
** ret
@@ -68,8 +68,7 @@ TEST_ALL (LOOP)
/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, w[0-9]+, w[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 2\]} 8 } } */
-/* 2 for the calculations of -17 and 17. */
-/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincw\tx[0-9]+\n} 8 } } */
/* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #16\n} 1 } } */
/* { dg-final { scan-assembler-times {\tdecw\tz[0-9]+\.s, all, mul #15\n} 1 } } */
@@ -86,8 +85,7 @@ TEST_ALL (LOOP)
/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, x[0-9]+, x[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]+/z, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7]+, \[x[0-9]+, x[0-9]+, lsl 3\]} 8 } } */
-/* 2 for the calculations of -17 and 17. */
-/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 10 } } */
+/* { dg-final { scan-assembler-times {\tincd\tx[0-9]+\n} 8 } } */
/* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #16\n} 1 } } */
/* { dg-final { scan-assembler-times {\tdecd\tz[0-9]+\.d, all, mul #15\n} 1 } } */
@@ -6,8 +6,7 @@
/*
** test_1:
-** cntd x12, all, mul #9
-** lsl x12, x12, #?4
+** rdvl x12, #18
** mov x11, sp
** ...
** sub sp, sp, x12