Message ID | Y/yjTE9I4WrC4tEg@arm.com |
---|---|
State | New |
Headers | show |
Series | [1/4] middle-end: Revert can_special_div_by_const changes [PR108583] | expand |
Ping, And updating the hook. There are no new test as new correctness tests were added to the mid-end and the existing codegen tests for this already exist. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR target/108583 * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove. (*bitmask_shift_plus<mode>): New. * config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New. (@aarch64_bitmask_udiv<mode>3): Remove. * config/aarch64/aarch64.cc (aarch64_vectorize_can_special_div_by_constant, TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed. (TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT, aarch64_vectorize_preferred_div_as_shifts_over_mult): New. --- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7f212bf37cd2c120dceb7efa733c9fa76226f029..e1ecb88634f93d380ef534093ea6599dc7278108 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4867,60 +4867,27 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>" } ) -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; If we imagine a short as being composed of two blocks of bytes then -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to -;; adding 1 to each sub component: -;; -;; short value of 16-bits -;; ┌──────────────┬────────────────┐ -;; │ │ │ -;; └──────────────┴────────────────┘ -;; 8-bit part1 ▲ 8-bit part2 ▲ -;; │ │ -;; │ │ -;; +1 +1 -;; -;; after the first addition, we have to shift right by 8, and narrow the -;; results back to a byte. Remember that the addition must be done in -;; double the precision of the input. 
Since 8 is half the size of a short -;; we can use a narrowing halfing instruction in AArch64, addhn which also -;; does the addition in a wider precision and narrows back to a byte. The -;; shift itself is implicit in the operation as it writes back only the top -;; half of the result. i.e. bits 2*esize-1:esize. -;; -;; Since we have narrowed the result of the first part back to a byte, for -;; the second addition we can use a widening addition, uaddw. -;; -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8. -;; -;; The shift is later optimized by combine to a uzp2 with movi #0. -(define_expand "@aarch64_bitmask_udiv<mode>3" - [(match_operand:VQN 0 "register_operand") - (match_operand:VQN 1 "register_operand") - (match_operand:VQN 2 "immediate_operand")] +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus<mode>" + [(set (match_operand:VQN 0 "register_operand" "=&w") + (plus:VQN + (lshiftrt:VQN + (plus:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")) + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + (match_operand:VQN 4 "register_operand" "w")))] "TARGET_SIMD" + "#" + "&& true" + [(const_int 0)] { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (<MODE>mode); - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode)); - rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); - rtx tmp2 = gen_reg_rtx (<MODE>mode); - emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); - unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); - rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize); - emit_insn (gen_aarch64_uaddw<Vnarrowq> 
(tmp2, operands[1], tmp1)); - emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector)); + rtx tmp; + if (can_create_pseudo_p ()) + tmp = gen_reg_rtx (<VNARROWQ>mode); + else + tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0])); + emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2])); + emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp)); DONE; }) diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d6873877386222d56144cc115e3953a61 100644 --- a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2317,41 +2317,24 @@ (define_insn "@aarch64_sve_<optab><mode>" ;; ---- [INT] Misc optab implementations ;; ------------------------------------------------------------------------- ;; Includes: -;; - aarch64_bitmask_udiv +;; - bitmask_shift_plus ;; ------------------------------------------------------------------------- -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; See aarch64-simd.md for bigger explanation. 
-(define_expand "@aarch64_bitmask_udiv<mode>3" - [(match_operand:SVE_FULL_HSDI 0 "register_operand") - (match_operand:SVE_FULL_HSDI 1 "register_operand") - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector +(define_insn "*bitmask_shift_plus<mode>" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI + [(match_operand:<VPRED> 1) + (lshiftrt:SVE_FULL_HSDI + (plus:SVE_FULL_HSDI + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) + (match_operand:SVE_FULL_HSDI 4 + "aarch64_simd_shift_imm_vec_exact_top" "Dr"))] + UNSPEC_PRED_X))] "TARGET_SVE2" -{ - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (<MODE>mode); - rtx tmp1 = gen_reg_rtx (<VNARROW>mode); - rtx tmp2 = gen_reg_rtx (<VNARROW>mode); - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1); - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1], - addend)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1], - lowpart_subreg (<MODE>mode, tmp1, - <VNARROW>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode)); - DONE; -}) + "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>" +) ;; ========================================================================= ;; == Permutation diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..eb4f99ee524844ed5b3684c6fe807a4128685423 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3849,6 +3849,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, return 
default_vectorize_related_mode (vector_mode, element_mode, nunits); } +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */ + +static bool +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type) +{ + machine_mode mode = TYPE_MODE (type); + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + bool sve_p = (vec_flags & VEC_ANY_SVE); + bool simd_p = (vec_flags & VEC_ADVSIMD); + + return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD); +} + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, prefer to use the first arithmetic operand as the else value if the else value doesn't matter, since that exactly matches the SVE @@ -24363,46 +24376,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } - -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ - -bool -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, - tree vectype, wide_int cst, - rtx *output, rtx in0, rtx in1) -{ - if (code != TRUNC_DIV_EXPR - || !TYPE_UNSIGNED (vectype)) - return false; - - machine_mode mode = TYPE_MODE (vectype); - unsigned int flags = aarch64_classify_vector_mode (mode); - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) - return false; - - int pow = wi::exact_log2 (cst + 1); - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); - /* SVE actually has a div operator, we may have gotten here through - that route. */ - if (pow != (int) (element_precision (vectype) / 2) - || insn_code == CODE_FOR_nothing) - return false; - - /* We can use the optimized pattern. 
*/ - if (in0 == NULL_RTX && in1 == NULL_RTX) - return true; - - gcc_assert (output); - - expand_operand ops[3]; - create_output_operand (&ops[0], *output, mode); - create_input_operand (&ops[1], in0, mode); - create_fixed_operand (&ops[2], in1); - expand_insn (insn_code, 3, ops); - *output = ops[0].value; - return true; -} - /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. */ @@ -27904,13 +27877,13 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_MAX_ANCHOR_OFFSET #define TARGET_MAX_ANCHOR_OFFSET 4095 +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \ + aarch64_vectorize_preferred_div_as_shifts_over_mult + #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ - aarch64_vectorize_can_special_div_by_constant - #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ aarch64_vectorize_preferred_vector_alignment
Tamar Christina <Tamar.Christina@arm.com> writes: > Ping, > > And updating the hook. > > There are no new test as new correctness tests were added to the mid-end and > the existing codegen tests for this already exist. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > PR target/108583 > * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove. > (*bitmask_shift_plus<mode>): New. > * config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New. > (@aarch64_bitmask_udiv<mode>3): Remove. > * config/aarch64/aarch64.cc > (aarch64_vectorize_can_special_div_by_constant, > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed. > (TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT, > aarch64_vectorize_preferred_div_as_shifts_over_mult): New. > > --- inline copy of patch --- > > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index 7f212bf37cd2c120dceb7efa733c9fa76226f029..e1ecb88634f93d380ef534093ea6599dc7278108 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -4867,60 +4867,27 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>" > } > ) > > -;; div optimizations using narrowings > -;; we can do the division e.g. shorts by 255 faster by calculating it as > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in > -;; double the precision of x. > -;; > -;; If we imagine a short as being composed of two blocks of bytes then > -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to > -;; adding 1 to each sub component: > -;; > -;; short value of 16-bits > -;; ┌──────────────┬────────────────┐ > -;; │ │ │ > -;; └──────────────┴────────────────┘ > -;; 8-bit part1 ▲ 8-bit part2 ▲ > -;; │ │ > -;; │ │ > -;; +1 +1 > -;; > -;; after the first addition, we have to shift right by 8, and narrow the > -;; results back to a byte. 
Remember that the addition must be done in > -;; double the precision of the input. Since 8 is half the size of a short > -;; we can use a narrowing halfing instruction in AArch64, addhn which also > -;; does the addition in a wider precision and narrows back to a byte. The > -;; shift itself is implicit in the operation as it writes back only the top > -;; half of the result. i.e. bits 2*esize-1:esize. > -;; > -;; Since we have narrowed the result of the first part back to a byte, for > -;; the second addition we can use a widening addition, uaddw. > -;; > -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8. > -;; > -;; The shift is later optimized by combine to a uzp2 with movi #0. > -(define_expand "@aarch64_bitmask_udiv<mode>3" > - [(match_operand:VQN 0 "register_operand") > - (match_operand:VQN 1 "register_operand") > - (match_operand:VQN 2 "immediate_operand")] > +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector > +(define_insn_and_split "*bitmask_shift_plus<mode>" > + [(set (match_operand:VQN 0 "register_operand" "=&w") > + (plus:VQN > + (lshiftrt:VQN > + (plus:VQN (match_operand:VQN 1 "register_operand" "w") > + (match_operand:VQN 2 "register_operand" "w")) > + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) I guess this is personal preference, sorry, but I think we should drop the constraint. The predicate does the real check, and the operand is never reloaded, so "Dr" isn't any more helpful than an empty constraint, and IMO can be confusing. 
> + (match_operand:VQN 4 "register_operand" "w")))] > "TARGET_SIMD" > + "#" > + "&& true" > + [(const_int 0)] > { > - unsigned HOST_WIDE_INT size > - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; > - rtx elt = unwrap_const_vec_duplicate (operands[2]); > - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) > - FAIL; > - > - rtx addend = gen_reg_rtx (<MODE>mode); > - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); > - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode)); > - rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); > - rtx tmp2 = gen_reg_rtx (<MODE>mode); > - emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); > - unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); > - rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize); > - emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); > - emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector)); > + rtx tmp; > + if (can_create_pseudo_p ()) > + tmp = gen_reg_rtx (<VNARROWQ>mode); > + else > + tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0])); > + emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2])); > + emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp)); > DONE; > }) In the previous review, I said: However, IIUC, this pattern would only be formed from combining three distinct patterns. Is that right? If so, we should be able to handle it as a plain define_split, with no define_insn. That should make things simpler, so would be worth trying before the changes I mentioned above. Did you try that? I still think it'd be preferable to defining a new insn. 
> diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md > index 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d6873877386222d56144cc115e3953a61 100644 > --- a/gcc/config/aarch64/aarch64-sve2.md > +++ b/gcc/config/aarch64/aarch64-sve2.md > @@ -2317,41 +2317,24 @@ (define_insn "@aarch64_sve_<optab><mode>" > ;; ---- [INT] Misc optab implementations > ;; ------------------------------------------------------------------------- > ;; Includes: > -;; - aarch64_bitmask_udiv > +;; - bitmask_shift_plus This is no longer an optab. The original purpose of the "Includes:" comments was to list the ISA instructions that are actually being generated, as a short-cut to working through all the abstractions. Just listing define_insn names doesn't really add anything over reading the insns themselves. Since the new pattern is an alternative way of generating ADDHNB, it probably belongs in the "Narrowing binary arithmetic" section. > ;; ------------------------------------------------------------------------- > > -;; div optimizations using narrowings > -;; we can do the division e.g. shorts by 255 faster by calculating it as > -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in > -;; double the precision of x. > -;; > -;; See aarch64-simd.md for bigger explanation. 
> -(define_expand "@aarch64_bitmask_udiv<mode>3" > - [(match_operand:SVE_FULL_HSDI 0 "register_operand") > - (match_operand:SVE_FULL_HSDI 1 "register_operand") > - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] > +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector > +(define_insn "*bitmask_shift_plus<mode>" > + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") > + (unspec:SVE_FULL_HSDI > + [(match_operand:<VPRED> 1) > + (lshiftrt:SVE_FULL_HSDI > + (plus:SVE_FULL_HSDI > + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") > + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) > + (match_operand:SVE_FULL_HSDI 4 > + "aarch64_simd_shift_imm_vec_exact_top" "Dr"))] Same comment about the constraints here. > + UNSPEC_PRED_X))] > "TARGET_SVE2" > -{ > - unsigned HOST_WIDE_INT size > - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1; > - rtx elt = unwrap_const_vec_duplicate (operands[2]); > - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) > - FAIL; > - > - rtx addend = gen_reg_rtx (<MODE>mode); > - rtx tmp1 = gen_reg_rtx (<VNARROW>mode); > - rtx tmp2 = gen_reg_rtx (<VNARROW>mode); > - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1); > - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode)); > - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1], > - addend)); > - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1], > - lowpart_subreg (<MODE>mode, tmp1, > - <VNARROW>mode))); > - emit_move_insn (operands[0], > - lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode)); > - DONE; > -}) > + "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>" > +) The pattern LGTM otherwise. 
> ;; ========================================================================= > ;; == Permutation > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..eb4f99ee524844ed5b3684c6fe807a4128685423 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -3849,6 +3849,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, > return default_vectorize_related_mode (vector_mode, element_mode, nunits); > } > > +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */ > + > +static bool > +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type) > +{ > + machine_mode mode = TYPE_MODE (type); > + unsigned int vec_flags = aarch64_classify_vector_mode (mode); > + bool sve_p = (vec_flags & VEC_ANY_SVE); > + bool simd_p = (vec_flags & VEC_ADVSIMD); > + > + return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD); > +} > + And the hook LGTM too. Thanks, Richard > /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, > prefer to use the first arithmetic operand as the else value if > the else value doesn't matter, since that exactly matches the SVE > @@ -24363,46 +24376,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, > > return ret; > } > - > -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. 
*/ > - > -bool > -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, > - tree vectype, wide_int cst, > - rtx *output, rtx in0, rtx in1) > -{ > - if (code != TRUNC_DIV_EXPR > - || !TYPE_UNSIGNED (vectype)) > - return false; > - > - machine_mode mode = TYPE_MODE (vectype); > - unsigned int flags = aarch64_classify_vector_mode (mode); > - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) > - return false; > - > - int pow = wi::exact_log2 (cst + 1); > - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); > - /* SVE actually has a div operator, we may have gotten here through > - that route. */ > - if (pow != (int) (element_precision (vectype) / 2) > - || insn_code == CODE_FOR_nothing) > - return false; > - > - /* We can use the optimized pattern. */ > - if (in0 == NULL_RTX && in1 == NULL_RTX) > - return true; > - > - gcc_assert (output); > - > - expand_operand ops[3]; > - create_output_operand (&ops[0], *output, mode); > - create_input_operand (&ops[1], in0, mode); > - create_fixed_operand (&ops[2], in1); > - expand_insn (insn_code, 3, ops); > - *output = ops[0].value; > - return true; > -} > - > /* Generate a byte permute mask for a register of mode MODE, > which has NUNITS units. */ > > @@ -27904,13 +27877,13 @@ aarch64_libgcc_floating_mode_supported_p > #undef TARGET_MAX_ANCHOR_OFFSET > #define TARGET_MAX_ANCHOR_OFFSET 4095 > > +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT > +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \ > + aarch64_vectorize_preferred_div_as_shifts_over_mult > + > #undef TARGET_VECTOR_ALIGNMENT > #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment > > -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ > - aarch64_vectorize_can_special_div_by_constant > - > #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT > #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ > aarch64_vectorize_preferred_vector_alignment
> -----Original Message----- > From: Richard Sandiford <richard.sandiford@arm.com> > Sent: Wednesday, March 8, 2023 9:18 AM > To: Tamar Christina <Tamar.Christina@arm.com> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw > <Richard.Earnshaw@arm.com>; Marcus Shawcroft > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> > Subject: Re: [PATCH 4/4]AArch64 Update div-bitmask to implement new > optab instead of target hook [PR108583] > > Tamar Christina <Tamar.Christina@arm.com> writes: > > Ping, > > > > And updating the hook. > > > > There are no new test as new correctness tests were added to the > > mid-end and the existing codegen tests for this already exist. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > PR target/108583 > > * config/aarch64/aarch64-simd.md > (@aarch64_bitmask_udiv<mode>3): Remove. > > (*bitmask_shift_plus<mode>): New. > > * config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New. > > (@aarch64_bitmask_udiv<mode>3): Remove. > > * config/aarch64/aarch64.cc > > (aarch64_vectorize_can_special_div_by_constant, > > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed. > > (TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT, > > aarch64_vectorize_preferred_div_as_shifts_over_mult): New. > > > > --- inline copy of patch --- > > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index > > > 7f212bf37cd2c120dceb7efa733c9fa76226f029..e1ecb88634f93d380ef534 > 093ea6 > > 599dc7278108 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -4867,60 +4867,27 @@ (define_expand > "aarch64_<sur><addsub>hn2<mode>" > > } > > ) > > > > -;; div optimizations using narrowings -;; we can do the division e.g. 
> > shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) > > >> 8 assuming the operation is done in -;; double the precision of x. > > -;; > > -;; If we imagine a short as being composed of two blocks of bytes > > then -;; adding 257 or 0b0000_0001_0000_0001 to the number is > > equivalent to -;; adding 1 to each sub component: > > -;; > > -;; short value of 16-bits > > -;; ┌──────────────┬────────────────┐ > > -;; │ │ │ > > -;; └──────────────┴────────────────┘ > > -;; 8-bit part1 ▲ 8-bit part2 ▲ > > -;; │ │ > > -;; │ │ > > -;; +1 +1 > > -;; > > -;; after the first addition, we have to shift right by 8, and narrow > > the -;; results back to a byte. Remember that the addition must be > > done in -;; double the precision of the input. Since 8 is half the > > size of a short -;; we can use a narrowing halfing instruction in > > AArch64, addhn which also -;; does the addition in a wider precision > > and narrows back to a byte. The -;; shift itself is implicit in the > > operation as it writes back only the top -;; half of the result. i.e. bits 2*esize- > 1:esize. > > -;; > > -;; Since we have narrowed the result of the first part back to a > > byte, for -;; the second addition we can use a widening addition, uaddw. > > -;; > > -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8. > > -;; > > -;; The shift is later optimized by combine to a uzp2 with movi #0. 
> > -(define_expand "@aarch64_bitmask_udiv<mode>3" > > - [(match_operand:VQN 0 "register_operand") > > - (match_operand:VQN 1 "register_operand") > > - (match_operand:VQN 2 "immediate_operand")] > > +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the > > +vector (define_insn_and_split "*bitmask_shift_plus<mode>" > > + [(set (match_operand:VQN 0 "register_operand" "=&w") > > + (plus:VQN > > + (lshiftrt:VQN > > + (plus:VQN (match_operand:VQN 1 "register_operand" "w") > > + (match_operand:VQN 2 "register_operand" "w")) > > + (match_operand:VQN 3 > > +"aarch64_simd_shift_imm_vec_exact_top" "Dr")) > > I guess this is personal preference, sorry, but I think we should drop the > constraint. The predicate does the real check, and the operand is never > reloaded, so "Dr" isn't any more helpful than an empty constraint, and IMO > can be confusing. > > > + (match_operand:VQN 4 "register_operand" "w")))] > > "TARGET_SIMD" > > + "#" > > + "&& true" > > + [(const_int 0)] > > { > > - unsigned HOST_WIDE_INT size > > - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; > > - rtx elt = unwrap_const_vec_duplicate (operands[2]); > > - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) > > - FAIL; > > - > > - rtx addend = gen_reg_rtx (<MODE>mode); > > - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); > > - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, > > <VNARROWQ2>mode)); > > - rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); > > - rtx tmp2 = gen_reg_rtx (<MODE>mode); > > - emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); > > - unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); > > - rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, > > bitsize); > > - emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); > > - emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, > > shift_vector)); > > + rtx tmp; > > + if (can_create_pseudo_p ()) > > + tmp = gen_reg_rtx (<VNARROWQ>mode); else > > + tmp 
= gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0])); > > + emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], > operands[2])); > > + emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], > > + tmp)); > > DONE; > > }) > > In the previous review, I said: > > However, IIUC, this pattern would only be formed from combining > three distinct patterns. Is that right? If so, we should be able > to handle it as a plain define_split, with no define_insn. > That should make things simpler, so would be worth trying before > the changes I mentioned above. > > Did you try that? I still think it'd be preferable to defining a new insn. Yes I did! Sorry I forgot to mention that. When I made it a split for some reason It wasn't matching it anymore. Regards, Tamar > > > diff --git a/gcc/config/aarch64/aarch64-sve2.md > > b/gcc/config/aarch64/aarch64-sve2.md > > index > > > 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d687387738622 > 2d56144c > > c115e3953a61 100644 > > --- a/gcc/config/aarch64/aarch64-sve2.md > > +++ b/gcc/config/aarch64/aarch64-sve2.md > > @@ -2317,41 +2317,24 @@ (define_insn > "@aarch64_sve_<optab><mode>" > > ;; ---- [INT] Misc optab implementations ;; > > ---------------------------------------------------------------------- > > --- > > ;; Includes: > > -;; - aarch64_bitmask_udiv > > +;; - bitmask_shift_plus > > This is no longer an optab. > > The original purpose of the "Includes:" comments was to list the ISA > instructions that are actually being generated, as a short-cut to working > through all the abstractions. Just listing define_insn names doesn't really add > anything over reading the insns themselves. > > Since the new pattern is an alternative way of generating ADDHNB, it probably > belongs in the "Narrowing binary arithmetic" section. > > > ;; > > ---------------------------------------------------------------------- > > --- > > > > -;; div optimizations using narrowings -;; we can do the division e.g. 
> > shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) > > >> 8 assuming the operation is done in -;; double the precision of x. > > -;; > > -;; See aarch64-simd.md for bigger explanation. > > -(define_expand "@aarch64_bitmask_udiv<mode>3" > > - [(match_operand:SVE_FULL_HSDI 0 "register_operand") > > - (match_operand:SVE_FULL_HSDI 1 "register_operand") > > - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] > > +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector > > +(define_insn "*bitmask_shift_plus<mode>" > > + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") > > + (unspec:SVE_FULL_HSDI > > + [(match_operand:<VPRED> 1) > > + (lshiftrt:SVE_FULL_HSDI > > + (plus:SVE_FULL_HSDI > > + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") > > + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) > > + (match_operand:SVE_FULL_HSDI 4 > > + "aarch64_simd_shift_imm_vec_exact_top" "Dr"))] > > Same comment about the constraints here. > > > + UNSPEC_PRED_X))] > > "TARGET_SVE2" > > -{ > > - unsigned HOST_WIDE_INT size > > - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1; > > - rtx elt = unwrap_const_vec_duplicate (operands[2]); > > - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) > > - FAIL; > > - > > - rtx addend = gen_reg_rtx (<MODE>mode); > > - rtx tmp1 = gen_reg_rtx (<VNARROW>mode); > > - rtx tmp2 = gen_reg_rtx (<VNARROW>mode); > > - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1); > > - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, > > <VNARROW>mode)); > > - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, > operands[1], > > - addend)); > > - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, > operands[1], > > - lowpart_subreg (<MODE>mode, tmp1, > > - <VNARROW>mode))); > > - emit_move_insn (operands[0], > > - lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode)); > > - DONE; > > -}) > > + "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>" > > +) > > The pattern 
LGTM otherwise. > > > ;; > > > =================================================================== > === > > === > > ;; == Permutation > > diff --git a/gcc/config/aarch64/aarch64.cc > > b/gcc/config/aarch64/aarch64.cc index > > > e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..eb4f99ee524844ed5b36 > 84c6fe80 > > 7a4128685423 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -3849,6 +3849,19 @@ aarch64_vectorize_related_mode > (machine_mode vector_mode, > > return default_vectorize_related_mode (vector_mode, element_mode, > > nunits); } > > > > +/* Implement > TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */ > > + > > +static bool > > +aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type) > > +{ > > + machine_mode mode = TYPE_MODE (type); > > + unsigned int vec_flags = aarch64_classify_vector_mode (mode); > > + bool sve_p = (vec_flags & VEC_ANY_SVE); > > + bool simd_p = (vec_flags & VEC_ADVSIMD); > > + > > + return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD); } > > + > > And the hook LGTM too. > > Thanks, > Richard > > > /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, > > prefer to use the first arithmetic operand as the else value if > > the else value doesn't matter, since that exactly matches the SVE > > @@ -24363,46 +24376,6 @@ aarch64_vectorize_vec_perm_const > > (machine_mode vmode, machine_mode op_mode, > > > > return ret; > > } > > - > > -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. 
*/ > > - > > -bool > > -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, > > - tree vectype, wide_int cst, > > - rtx *output, rtx in0, rtx in1) > > -{ > > - if (code != TRUNC_DIV_EXPR > > - || !TYPE_UNSIGNED (vectype)) > > - return false; > > - > > - machine_mode mode = TYPE_MODE (vectype); > > - unsigned int flags = aarch64_classify_vector_mode (mode); > > - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) > > - return false; > > - > > - int pow = wi::exact_log2 (cst + 1); > > - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE > > (vectype)); > > - /* SVE actually has a div operator, we may have gotten here through > > - that route. */ > > - if (pow != (int) (element_precision (vectype) / 2) > > - || insn_code == CODE_FOR_nothing) > > - return false; > > - > > - /* We can use the optimized pattern. */ > > - if (in0 == NULL_RTX && in1 == NULL_RTX) > > - return true; > > - > > - gcc_assert (output); > > - > > - expand_operand ops[3]; > > - create_output_operand (&ops[0], *output, mode); > > - create_input_operand (&ops[1], in0, mode); > > - create_fixed_operand (&ops[2], in1); > > - expand_insn (insn_code, 3, ops); > > - *output = ops[0].value; > > - return true; > > -} > > - > > /* Generate a byte permute mask for a register of mode MODE, > > which has NUNITS units. 
*/ > > > > @@ -27904,13 +27877,13 @@ > aarch64_libgcc_floating_mode_supported_p > > #undef TARGET_MAX_ANCHOR_OFFSET > > #define TARGET_MAX_ANCHOR_OFFSET 4095 > > > > +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT > > +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \ > > + aarch64_vectorize_preferred_div_as_shifts_over_mult > > + > > #undef TARGET_VECTOR_ALIGNMENT > > #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment > > > > -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > > -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ > > - aarch64_vectorize_can_special_div_by_constant > > - > > #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT > > #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ > > aarch64_vectorize_preferred_vector_alignment
Tamar Christina <Tamar.Christina@arm.com> writes: >> -----Original Message----- >> > + (match_operand:VQN 4 "register_operand" "w")))] >> > "TARGET_SIMD" >> > + "#" >> > + "&& true" >> > + [(const_int 0)] >> > { >> > - unsigned HOST_WIDE_INT size >> > - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; >> > - rtx elt = unwrap_const_vec_duplicate (operands[2]); >> > - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) >> > - FAIL; >> > - >> > - rtx addend = gen_reg_rtx (<MODE>mode); >> > - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); >> > - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, >> > <VNARROWQ2>mode)); >> > - rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); >> > - rtx tmp2 = gen_reg_rtx (<MODE>mode); >> > - emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); >> > - unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); >> > - rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, >> > bitsize); >> > - emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); >> > - emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, >> > shift_vector)); >> > + rtx tmp; >> > + if (can_create_pseudo_p ()) >> > + tmp = gen_reg_rtx (<VNARROWQ>mode); else >> > + tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0])); >> > + emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], >> operands[2])); >> > + emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], >> > + tmp)); >> > DONE; >> > }) >> >> In the previous review, I said: >> >> However, IIUC, this pattern would only be formed from combining >> three distinct patterns. Is that right? If so, we should be able >> to handle it as a plain define_split, with no define_insn. >> That should make things simpler, so would be worth trying before >> the changes I mentioned above. >> >> Did you try that? I still think it'd be preferable to defining a new insn. > > Yes I did! Sorry I forgot to mention that. 
> When I made it a split for some reason it wasn't matching it anymore. I was hoping for a bit more detail than that :-) But it seems that the reason is that we match SRA first, so the final combination is a 2-to-1 rather than 3-to-1. So yeah, the patch is OK with the other changes mentioned in the review. Thanks, Richard
--- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4867,60 +4867,27 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>" } ) -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; If we imagine a short as being composed of two blocks of bytes then -;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to -;; adding 1 to each sub component: -;; -;; short value of 16-bits -;; ┌──────────────┬────────────────┐ -;; │ │ │ -;; └──────────────┴────────────────┘ -;; 8-bit part1 ▲ 8-bit part2 ▲ -;; │ │ -;; │ │ -;; +1 +1 -;; -;; after the first addition, we have to shift right by 8, and narrow the -;; results back to a byte. Remember that the addition must be done in -;; double the precision of the input. Since 8 is half the size of a short -;; we can use a narrowing halfing instruction in AArch64, addhn which also -;; does the addition in a wider precision and narrows back to a byte. The -;; shift itself is implicit in the operation as it writes back only the top -;; half of the result. i.e. bits 2*esize-1:esize. -;; -;; Since we have narrowed the result of the first part back to a byte, for -;; the second addition we can use a widening addition, uaddw. -;; -;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8. -;; -;; The shift is later optimized by combine to a uzp2 with movi #0. 
-(define_expand "@aarch64_bitmask_udiv<mode>3" - [(match_operand:VQN 0 "register_operand") - (match_operand:VQN 1 "register_operand") - (match_operand:VQN 2 "immediate_operand")] +;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector +(define_insn_and_split "*bitmask_shift_plus<mode>" + [(set (match_operand:VQN 0 "register_operand" "=&w") + (plus:VQN + (lshiftrt:VQN + (plus:VQN (match_operand:VQN 1 "register_operand" "w") + (match_operand:VQN 2 "register_operand" "w")) + (match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" "Dr")) + (match_operand:VQN 4 "register_operand" "w")))] "TARGET_SIMD" + "#" + "&& true" + [(const_int 0)] { - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (<MODE>mode); - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode)); - rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); - rtx tmp2 = gen_reg_rtx (<MODE>mode); - emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); - unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); - rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize); - emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); - emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector)); + rtx tmp; + if (can_create_pseudo_p ()) + tmp = gen_reg_rtx (<VNARROWQ>mode); + else + tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0])); + emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2])); + emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp)); DONE; }) diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md index 40c0728a7e6f00c395c360ce7625bc2e4a018809..bed44d7d6873877386222d56144cc115e3953a61 100644 --- 
a/gcc/config/aarch64/aarch64-sve2.md +++ b/gcc/config/aarch64/aarch64-sve2.md @@ -2317,41 +2317,24 @@ (define_insn "@aarch64_sve_<optab><mode>" ;; ---- [INT] Misc optab implementations ;; ------------------------------------------------------------------------- ;; Includes: -;; - aarch64_bitmask_udiv +;; - bitmask_shift_plus ;; ------------------------------------------------------------------------- -;; div optimizations using narrowings -;; we can do the division e.g. shorts by 255 faster by calculating it as -;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in -;; double the precision of x. -;; -;; See aarch64-simd.md for bigger explanation. -(define_expand "@aarch64_bitmask_udiv<mode>3" - [(match_operand:SVE_FULL_HSDI 0 "register_operand") - (match_operand:SVE_FULL_HSDI 1 "register_operand") - (match_operand:SVE_FULL_HSDI 2 "immediate_operand")] +;; Optimize ((a + b) >> n) where n is half the bitsize of the vector +(define_insn "*bitmask_shift_plus<mode>" + [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") + (unspec:SVE_FULL_HSDI + [(match_operand:<VPRED> 1) + (lshiftrt:SVE_FULL_HSDI + (plus:SVE_FULL_HSDI + (match_operand:SVE_FULL_HSDI 2 "register_operand" "w") + (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")) + (match_operand:SVE_FULL_HSDI 4 + "aarch64_simd_shift_imm_vec_exact_top" "Dr"))] + UNSPEC_PRED_X))] "TARGET_SVE2" -{ - unsigned HOST_WIDE_INT size - = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1; - rtx elt = unwrap_const_vec_duplicate (operands[2]); - if (!CONST_INT_P (elt) || UINTVAL (elt) != size) - FAIL; - - rtx addend = gen_reg_rtx (<MODE>mode); - rtx tmp1 = gen_reg_rtx (<VNARROW>mode); - rtx tmp2 = gen_reg_rtx (<VNARROW>mode); - rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1); - emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1], - addend)); - emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, 
tmp2, operands[1], - lowpart_subreg (<MODE>mode, tmp1, - <VNARROW>mode))); - emit_move_insn (operands[0], - lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode)); - DONE; -}) + "addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>" +) ;; ========================================================================= ;; == Permutation diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index e6f47cbbb0d04a6f33b9a741ebb614cabd0204b9..2728fb347c0df1756b237f4d6268908eef6bdd2a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3849,6 +3849,13 @@ aarch64_vectorize_related_mode (machine_mode vector_mode, return default_vectorize_related_mode (vector_mode, element_mode, nunits); } +/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */ + +static bool aarch64_vectorize_preferred_div_as_shifts_over_mult (void) +{ + return true; +} + /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations, prefer to use the first arithmetic operand as the else value if the else value doesn't matter, since that exactly matches the SVE @@ -24363,46 +24370,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } - -/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ - -bool -aarch64_vectorize_can_special_div_by_constant (enum tree_code code, - tree vectype, wide_int cst, - rtx *output, rtx in0, rtx in1) -{ - if (code != TRUNC_DIV_EXPR - || !TYPE_UNSIGNED (vectype)) - return false; - - machine_mode mode = TYPE_MODE (vectype); - unsigned int flags = aarch64_classify_vector_mode (mode); - if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) - return false; - - int pow = wi::exact_log2 (cst + 1); - auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype)); - /* SVE actually has a div operator, we may have gotten here through - that route. */ - if (pow != (int) (element_precision (vectype) / 2) - || insn_code == CODE_FOR_nothing) - return false; - - /* We can use the optimized pattern. 
*/ - if (in0 == NULL_RTX && in1 == NULL_RTX) - return true; - - gcc_assert (output); - - expand_operand ops[3]; - create_output_operand (&ops[0], *output, mode); - create_input_operand (&ops[1], in0, mode); - create_fixed_operand (&ops[2], in1); - expand_insn (insn_code, 3, ops); - *output = ops[0].value; - return true; -} - /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. */ @@ -27904,13 +27871,13 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_MAX_ANCHOR_OFFSET #define TARGET_MAX_ANCHOR_OFFSET 4095 +#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT +#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \ + aarch64_vectorize_preferred_div_as_shifts_over_mult + #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment -#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST -#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ - aarch64_vectorize_can_special_div_by_constant - #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ aarch64_vectorize_preferred_vector_alignment