Message ID | Yy19Z/q/HPJ6wm5w@arm.com
---|---
State | New
Series | None
Hi All,

Ping, and updated patch based on mid-end changes.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
        * config/aarch64/aarch64.cc
        (aarch64_vectorize_can_special_div_by_constant): New.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/div-by-bitmask.c: New test.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
   }
 )

+;; div optimizations using narrowings
+;; we can do the division of e.g. shorts by 255 faster by calculating it as
+;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+;; double the precision of x.
+;;
+;; If we imagine a short as being composed of two blocks of bytes then
+;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+;; adding 1 to each sub component:
+;;
+;;       short value of 16-bits
+;; ┌──────────────┬────────────────┐
+;; │              │                │
+;; └──────────────┴────────────────┘
+;;   8-bit part1 ▲   8-bit part2  ▲
+;;               │                │
+;;               │                │
+;;              +1               +1
+;;
+;; after the first addition, we have to shift right by 8, and narrow the
+;; results back to a byte.  Remember that the addition must be done in
+;; double the precision of the input.  Since 8 is half the size of a short
+;; we can use a narrowing halving instruction in AArch64, addhn, which also
+;; does the addition in a wider precision and narrows back to a byte.  The
+;; shift itself is implicit in the operation as it writes back only the top
+;; half of the result, i.e. bits 2*esize-1:esize.
+;;
+;; Since we have narrowed the result of the first part back to a byte, for
+;; the second addition we can use a widening addition, uaddw.
+;;
+;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8
+;; to do the shift.
+;;
+;; The shift is later optimized by combine to a uzp2 with movi #0.
+(define_expand "@aarch64_bitmask_udiv<mode>3"
+  [(match_operand:VQN 0 "register_operand")
+   (match_operand:VQN 1 "register_operand")
+   (match_operand:VQN 2 "immediate_operand")]
+  "TARGET_SIMD"
+{
+  unsigned HOST_WIDE_INT size
+    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
+  if (!CONST_VECTOR_P (operands[2])
+      || const_vector_encoded_nelts (operands[2]) != 1
+      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
+    FAIL;
+
+  rtx addend = gen_reg_rtx (<MODE>mode);
+  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
+  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
+  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
+  rtx tmp2 = gen_reg_rtx (<MODE>mode);
+  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
+  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
+  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
+  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
+  DONE;
+})
+
 ;; pmul.
(define_insn "aarch64_pmul<mode>" diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb65154127dc7b72386c5a7 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ + +bool +aarch64_vectorize_can_special_div_by_constant (enum tree_code code, + tree vectype, wide_int cst, + rtx *output, rtx in0, rtx in1) +{ + if (code != TRUNC_DIV_EXPR + || !TYPE_UNSIGNED (vectype)) + return false; + + unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype)); + if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) + return false; + + if (in0 == NULL_RTX && in1 == NULL_RTX) + { + wide_int val = wi::add (cst, 1); + int pow = wi::exact_log2 (val); + return pow == (int)(element_precision (vectype) / 2); + } + + if (!VECTOR_TYPE_P (vectype)) + return false; + + gcc_assert (output); + + if (!*output) + *output = gen_reg_rtx (TYPE_MODE (vectype)); + + emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1)); + return true; +} + /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. */ @@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_VECTOR_ALIGNMENT #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment +#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST +#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ + aarch64_vectorize_can_special_div_by_constant + #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ aarch64_vectorize_preferred_vector_alignment diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c new file mode 100644 index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -std=c99" } */ +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ + +#include <stdint.h> + +#pragma GCC target "+nosve" + +/* +** draw_bitmap1: +** ... +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b +** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b +** ... +*/ +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xff; +} + +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xfe; +} + +/* +** draw_bitmap3: +** ... +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h +** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h +** ... +*/ +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * level) / 0xffffU; +} + +/* +** draw_bitmap4: +** ... 
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s +** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s +** ... +*/ +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) +{ + for (int i = 0; i < (n & -16); i+=1) + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; +} > -----Original Message----- > From: Tamar Christina <tamar.christina@arm.com> > Sent: Friday, September 23, 2022 10:34 AM > To: gcc-patches@gcc.gnu.org > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>; > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov > <Kyrylo.Tkachov@arm.com>; Richard Sandiford > <Richard.Sandiford@arm.com> > Subject: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division. > > Hi All, > > This adds an implementation for the new optab for unsigned pow2 bitmask > for AArch64. > > The implementation rewrites: > > x = y / (2 ^ (sizeof (y)/2)-1 > > into e.g. (for bytes) > > (x + ((x + 257) >> 8)) >> 8 > > where it's required that the additions be done in double the precision of x > such that we don't lose any bits during an overflow. > > Essentially the sequence decomposes the division into doing two smaller > divisions, one for the top and bottom parts of the number and adding the > results back together. > > To account for the fact that shift by 8 would be division by 256 we add 1 to > both parts of x such that when 255 we still get 1 as the answer. > > Because the amount we shift are half the original datatype we can use the > halfing instructions the ISA provides to do the operation instead of using > actual shifts. > > For AArch64 this means we generate for: > > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { > for (int i = 0; i < (n & -16); i+=1) > pixel[i] = (pixel[i] * level) / 0xff; } > > the following: > > movi v3.16b, 0x1 > umull2 v1.8h, v0.16b, v2.16b > umull v0.8h, v0.8b, v2.8b > addhn v5.8b, v1.8h, v3.8h > addhn v4.8b, v0.8h, v3.8h > uaddw v1.8h, v1.8h, v5.8b > uaddw v0.8h, v0.8h, v4.8b > uzp2 v0.16b, v0.16b, v1.16b > > instead of: > > umull v2.8h, v1.8b, v5.8b > umull2 v1.8h, v1.16b, v5.16b > umull v0.4s, v2.4h, v3.4h > umull2 v2.4s, v2.8h, v3.8h > umull v4.4s, v1.4h, v3.4h > umull2 v1.4s, v1.8h, v3.8h > uzp2 v0.8h, v0.8h, v2.8h > uzp2 v1.8h, v4.8h, v1.8h > shrn v0.8b, v0.8h, 7 > shrn2 v0.16b, v1.8h, 7 > > Which results in significantly faster code. > > Thanks for Wilco for the concept. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd.md > (@aarch64_bitmask_udiv<mode>3): New. > * config/aarch64/aarch64.cc > (aarch64_vectorize_can_special_div_by_constant): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/div-by-bitmask.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > index > 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0b > a6386c1ab50f77e 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -4831,6 +4831,65 @@ (define_expand > "aarch64_<sur><addsub>hn2<mode>" > } > ) > > +;; div optimizations using narrowings > +;; we can do the division e.g. shorts by 255 faster by calculating it > +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;; > +double the precision of x. 
> +;; > +;; If we imagine a short as being composed of two blocks of bytes then > +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to ;; > +adding 1 to each sub component: > +;; > +;; short value of 16-bits > +;; ┌──────────────┬────────────────┐ > +;; │ │ │ > +;; └──────────────┴────────────────┘ > +;; 8-bit part1 ▲ 8-bit part2 ▲ > +;; │ │ > +;; │ │ > +;; +1 +1 > +;; > +;; after the first addition, we have to shift right by 8, and narrow > +the ;; results back to a byte. Remember that the addition must be done > +in ;; double the precision of the input. Since 8 is half the size of a > +short ;; we can use a narrowing halfing instruction in AArch64, addhn > +which also ;; does the addition in a wider precision and narrows back > +to a byte. The ;; shift itself is implicit in the operation as it > +writes back only the top ;; half of the result. i.e. bits 2*esize-1:esize. > +;; > +;; Since we have narrowed the result of the first part back to a byte, > +for ;; the second addition we can use a widening addition, uaddw. > +;; > +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr > +by 8 ;; to shift and the vectorizer. > +;; > +;; The shift is later optimized by combine to a uzp2 with movi #0. > +(define_expand "@aarch64_bitmask_udiv<mode>3" > + [(match_operand:VQN 0 "register_operand") > + (match_operand:VQN 1 "register_operand") > + (match_operand:VQN 2 "immediate_operand")] > + "TARGET_SIMD" > +{ > + unsigned HOST_WIDE_INT size > + = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; > + if (!CONST_VECTOR_P (operands[2]) > + || const_vector_encoded_nelts (operands[2]) != 1 > + || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0))) > + FAIL; > + > + rtx addend = gen_reg_rtx (<MODE>mode); > + rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); > + emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, > +<VNARROWQ2>mode)); > + rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); > + rtx tmp2 = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); > + unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); > + rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, > +bitsize); > + emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); > + emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, > +shift_vector)); > + DONE; > +}) > + > ;; pmul. > > (define_insn "aarch64_pmul<mode>" > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > index > 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc3 > 7484b6fc6901bfb4 100644 > --- a/gcc/config/aarch64/aarch64.cc > +++ b/gcc/config/aarch64/aarch64.cc > @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const > (machine_mode vmode, machine_mode op_mode, > return ret; > } > > +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. 
*/ > + > +bool > +aarch64_vectorize_can_special_div_by_constant (enum tree_code code, > + tree vectype, > + tree treeop0, tree treeop1, > + rtx *output, rtx in0, rtx in1) { > + > + if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX)) > + return false; > + > + tree cst = uniform_integer_cst_p (treeop1); tree type; if (code != > + TRUNC_DIV_EXPR > + || !cst > + || !TYPE_UNSIGNED ((type = TREE_TYPE (cst))) > + || tree_int_cst_sgn (cst) != 1) > + return false; > + > + unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE > + (vectype)); if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) > + return false; > + > + if (in0 == NULL_RTX && in1 == NULL_RTX) > + { > + gcc_assert (treeop0 && treeop1); > + wide_int icst = wi::to_wide (cst); > + wide_int val = wi::add (icst, 1); > + int pow = wi::exact_log2 (val); > + return pow == (TYPE_PRECISION (type) / 2); > + } > + > + if (!VECTOR_TYPE_P (vectype)) > + return false; > + > + gcc_assert (output); > + > + if (!*output) > + *output = gen_reg_rtx (TYPE_MODE (vectype)); > + > + emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, > +in0, in1)); > + return true; > +} > + > /* Generate a byte permute mask for a register of mode MODE, > which has NUNITS units. */ > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index > 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d2 > 44a2a23e76cac097 100644 > --- a/gcc/doc/tm.texi > +++ b/gcc/doc/tm.texi > @@ -6112,6 +6112,22 @@ instruction pattern. There is no need for the hook > to handle these two implementation approaches itself. > @end deftypefn > > +@deftypefn {Target Hook} bool > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > +(enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree > +@var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) This > +hook is used to test whether the target has a special method of > +division of vectors of type @var{vectype} using the two operands > @code{treeop0}, and @code{treeop1} and producing a vector of type > @var{vectype}. The division will then not be decomposed by the and kept as > a div. > + > +When the hook is being used to test whether the target supports a > +special divide, @var{in0}, @var{in1}, and @var{output} are all null. > +When the hook is being used to emit a division, @var{in0} and @var{in1} > +are the source vectors of type @var{vecttype} and @var{output} is the > +destination vector of type @var{vectype}. > + > +Return true if the operation is possible, emitting instructions for it > +if rtxes are provided and updating @var{output}. > +@end deftypefn > + > @deftypefn {Target Hook} tree > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in}) This hook > should return the decl of a function that implements the vectorized variant > of the function with the @code{combined_fn} code diff --git > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index > 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04 > 076d058c24ce093 100644 > --- a/gcc/doc/tm.texi.in > +++ b/gcc/doc/tm.texi.in > @@ -4164,6 +4164,8 @@ address; but often a machine-dependent strategy > can generate better code. 
> > @hook TARGET_VECTORIZE_VEC_PERM_CONST > > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > + > @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION > > @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION > diff --git a/gcc/explow.cc b/gcc/explow.cc index > ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f > 5e346bf34ba0036 100644 > --- a/gcc/explow.cc > +++ b/gcc/explow.cc > @@ -1037,7 +1037,7 @@ round_push (rtx size) > TRUNC_DIV_EXPR. */ > size = expand_binop (Pmode, add_optab, size, alignm1_rtx, > NULL_RTX, 1, OPTAB_LIB_WIDEN); > - size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx, > + size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, > + align_rtx, > NULL_RTX, 1); > size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1); > > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned > required_align) > gen_int_mode (required_align / BITS_PER_UNIT - 1, > Pmode), > NULL_RTX, 1, OPTAB_LIB_WIDEN); > - target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target, > + target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, > target, > gen_int_mode (required_align / BITS_PER_UNIT, > Pmode), > NULL_RTX, 1); > diff --git a/gcc/expmed.h b/gcc/expmed.h index > 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6 > f33cb3595659b5 100644 > --- a/gcc/expmed.h > +++ b/gcc/expmed.h > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, > machine_mode, extern rtx expand_shift (enum tree_code, machine_mode, > rtx, poly_int64, rtx, > int); > #ifdef GCC_OPTABS_H > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx, > - rtx, int, enum optab_methods = > OPTAB_LIB_WIDEN); > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, > tree, > + rtx, rtx, rtx, int, > + enum optab_methods = OPTAB_LIB_WIDEN); > #endif > #endif > > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index > 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb09 > 90db8b97d3af414 100644 > --- a/gcc/expmed.cc > +++ b/gcc/expmed.cc > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx > op0, HOST_WIDE_INT d) > > rtx > expand_divmod (int rem_flag, enum tree_code code, machine_mode > mode, > - rtx op0, rtx op1, rtx target, int unsignedp, > - enum optab_methods methods) > + tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target, > + int unsignedp, enum optab_methods methods) > { > machine_mode compute_mode; > rtx tquotient; > @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code > code, machine_mode mode, > > last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0; > > + /* Check if the target has specific expansions for the division. */ > + if (treeop0 > + && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE > (treeop0), > + treeop0, treeop1, > + &target, op0, op1)) > + return target; > + > + > /* Now convert to the best mode to use. 
*/ > if (compute_mode != mode) > { > @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code > code, machine_mode mode, > || (optab_handler (sdivmod_optab, int_mode) > != CODE_FOR_nothing))) > quotient = expand_divmod (0, TRUNC_DIV_EXPR, > - int_mode, op0, > - gen_int_mode (abs_d, > + int_mode, treeop0, treeop1, > + op0, gen_int_mode (abs_d, > int_mode), > NULL_RTX, 0); > else > @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code > code, machine_mode mode, > size - 1, NULL_RTX, 0); > t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign), > NULL_RTX); > - t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, > op1, > - NULL_RTX, 0); > + t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, > treeop0, > + treeop1, t3, op1, NULL_RTX, 0); > if (t4) > { > rtx t5; > diff --git a/gcc/expr.cc b/gcc/expr.cc > index > 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96 > a8abc055fa34d9 100644 > --- a/gcc/expr.cc > +++ b/gcc/expr.cc > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target) > return expand_divmod (0, > FLOAT_MODE_P (GET_MODE (value)) > ? RDIV_EXPR : TRUNC_DIV_EXPR, > - GET_MODE (value), op1, op2, target, 0); > + GET_MODE (value), NULL, NULL, op1, op2, > + target, 0); > case MOD: > - return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > op1, op2, > - target, 0); > + return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > NULL, NULL, > + op1, op2, target, 0); > case UDIV: > - return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), > op1, op2, > - target, 1); > + return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), > NULL, NULL, > + op1, op2, target, 1); > case UMOD: > - return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > op1, op2, > - target, 1); > + return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > NULL, NULL, > + op1, op2, target, 1); > case ASHIFTRT: > return expand_simple_binop (GET_MODE (value), code, op1, op2, > target, 0, OPTAB_LIB_WIDEN); > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, > machine_mode mode, tree treeop0, > bool speed_p = optimize_insn_for_speed_p (); > do_pending_stack_adjust (); > start_sequence (); > - rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1); > + rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1, > + op0, op1, target, 1); > rtx_insn *uns_insns = get_insns (); > end_sequence (); > start_sequence (); > - rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0); > + rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1, > + op0, op1, target, 0); > rtx_insn *sgn_insns = get_insns (); > end_sequence (); > unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 +9019,8 > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree > treeop0, > emit_insn (sgn_insns); > return sgn_ret; > } > - return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp); > + return expand_divmod (mod_p, code, mode, treeop0, treeop1, > + op0, op1, target, unsignedp); > } > > rtx > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index > 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd > 872f340855dc96 100644 > --- a/gcc/optabs.cc > +++ b/gcc/optabs.cc > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, > rtx op0, rtx op1, bool unsignedp) > return NULL_RTX; > } > } > - rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, > sum, > - gen_int_mode (INTVAL (op1), > word_mode), > + rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, > NULL, NULL, > + sum, gen_int_mode 
(INTVAL (op1), > + word_mode), > NULL_RTX, 1, OPTAB_DIRECT); > if (remainder == NULL_RTX) > return NULL_RTX; > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode > mode, rtx op0, rtx op1, rtx *rem, > > if (op11 != const1_rtx) > { > - rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11, > - NULL_RTX, unsignedp, OPTAB_DIRECT); > + rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, > quot1, > + op11, NULL_RTX, unsignedp, > OPTAB_DIRECT); > if (rem2 == NULL_RTX) > return NULL_RTX; > > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode > mode, rtx op0, rtx op1, rtx *rem, > if (rem2 == NULL_RTX) > return NULL_RTX; > > - rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11, > - NULL_RTX, unsignedp, OPTAB_DIRECT); > + rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, > quot1, > + op11, NULL_RTX, unsignedp, > OPTAB_DIRECT); > if (quot2 == NULL_RTX) > return NULL_RTX; > > diff --git a/gcc/target.def b/gcc/target.def index > 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b070 > 81cdd70113db9b1 100644 > --- a/gcc/target.def > +++ b/gcc/target.def > @@ -1902,6 +1902,25 @@ implementation approaches itself.", > const vec_perm_indices &sel), > NULL) > > +DEFHOOK > +(can_special_div_by_const, > + "This hook is used to test whether the target has a special method > +of\n\ division of vectors of type @var{vectype} using the two operands > +@code{treeop0},\n\ and @code{treeop1} and producing a vector of type > +@var{vectype}. The division\n\ will then not be decomposed by the and > +kept as a div.\n\ \n\ When the hook is being used to test whether the > +target supports a special\n\ divide, @var{in0}, @var{in1}, and > +@var{output} are all null. When the hook\n\ is being used to emit a > +division, @var{in0} and @var{in1} are the source\n\ vectors of type > +@var{vecttype} and @var{output} is the destination vector of\n\ type > +@var{vectype}.\n\ \n\ Return true if the operation is possible, > +emitting instructions for it\n\ if rtxes are provided and updating > +@var{output}.", bool, (enum tree_code, tree vectype, tree treeop0, > +tree treeop1, rtx *output, > + rtx in0, rtx in1), > + default_can_special_div_by_const) > + > /* Return true if the target supports misaligned store/load of a > specific factor denoted in the third parameter. The last parameter > is true if the access is defined in a packed struct. 
*/ diff --git a/gcc/target.h > b/gcc/target.h index > d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56 > f39c061f68b665 100644 > --- a/gcc/target.h > +++ b/gcc/target.h > @@ -51,6 +51,7 @@ > #include "insn-codes.h" > #include "tm.h" > #include "hard-reg-set.h" > +#include "tree-core.h" > > #if CHECKING_P > > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index > ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e > 2640d63f936b336d 100644 > --- a/gcc/targhooks.h > +++ b/gcc/targhooks.h > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage > (addr_space_t, location_t); extern rtx default_addr_space_convert (rtx, > tree, tree); extern unsigned int default_case_values_threshold (void); > extern bool default_have_conditional_execution (void); > +extern bool default_can_special_div_by_const (enum tree_code, tree, > tree, tree, > + rtx *, rtx, rtx); > > extern bool default_libc_has_function (enum function_class, tree); extern > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc > b/gcc/targhooks.cc index > b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241 > 279936ced41ee95 100644 > --- a/gcc/targhooks.cc > +++ b/gcc/targhooks.cc > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void) > return HAVE_conditional_execution; > } > > +/* Default that no division by constant operations are special. */ > +bool default_can_special_div_by_const (enum tree_code, tree, tree, > +tree, rtx *, rtx, > + rtx) > +{ > + return false; > +} > + > /* By default we assume that c99 functions are present at the runtime, > but sincos is not. */ > bool > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3 > d7b4d5b64a19b9 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > @@ -0,0 +1,25 @@ > +/* { dg-require-effective-target vect_int } */ > + > +#include <stdint.h> > +#include "tree-vect.h" > + > +#define N 50 > +#define TYPE uint8_t > + > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * level) / 0xff; } > + > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * level) / 0xff; } > + > +#include "vect-div-bitmask.h" > + > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > +detected" "vect" { target aarch64*-*-* } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3 > db75b3e4112e2cc > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > @@ -0,0 +1,25 @@ > +/* { dg-require-effective-target vect_int } */ > + > +#include <stdint.h> > +#include "tree-vect.h" > + > +#define N 50 > +#define TYPE uint16_t > + > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * level) / 0xffffU; } > + > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * level) / 0xffffU; } > + > +#include "vect-div-bitmask.h" > + > 
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > +detected" "vect" { target aarch64*-*-* } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720 > 157701d9d1cf852 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > @@ -0,0 +1,26 @@ > +/* { dg-require-effective-target vect_int } */ > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* > +} } */ > + > +#include <stdint.h> > +#include "tree-vect.h" > + > +#define N 50 > +#define TYPE uint32_t > + > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > + > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > +restrict pixel, TYPE level, int n) { > + for (int i = 0; i < n; i+=1) > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > + > +#include "vect-div-bitmask.h" > + > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > +detected" "vect" { target aarch64*-*-* } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > new file mode 100644 > index > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1 > 832f28ebd07993e > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > @@ -0,0 +1,43 @@ > +#include <stdio.h> > + > +#ifndef N > +#define N 65 > +#endif > + > +#ifndef TYPE > +#define TYPE uint32_t > +#endif > + > +#ifndef DEBUG > +#define DEBUG 0 > +#endif > + > +#define BASE ((TYPE) -1 < 0 ? -126 : 4) > + > +int main () > +{ > + TYPE a[N]; > + TYPE b[N]; > + > + for (int i = 0; i < N; ++i) > + { > + a[i] = BASE + i * 13; > + b[i] = BASE + i * 13; > + if (DEBUG) > + printf ("%d: 0x%x\n", i, a[i]); > + } > + > + fun1 (a, N / 2, N); > + fun2 (b, N / 2, N); > + > + for (int i = 0; i < N; ++i) > + { > + if (DEBUG) > + printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]); > + > + if (a[i] != b[i]) > + __builtin_abort (); > + } > + return 0; > +} > + > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44a > b211cd246d82d5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > @@ -0,0 +1,61 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -std=c99" } */ > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } > +*/ > + > +#include <stdint.h> > + > +#pragma GCC target "+nosve" > + > +/* > +** draw_bitmap1: > +** ... > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > +** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b > +** ... > +*/ > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xff; } > + > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) { > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xfe; } > + > +/* > +** draw_bitmap3: > +** ... 
> +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > +** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h > +** ... > +*/ > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) { > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xffffU; } > + > +/* > +** draw_bitmap4: > +** ... > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > +** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** ... > +*/ > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) { > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index > 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c > 9a12046b6ec94f3 100644 > --- a/gcc/tree-vect-generic.cc > +++ b/gcc/tree-vect-generic.cc > @@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator > *gsi, tree type, tree compute_type > tree rhs2 = gimple_assign_rhs2 (assign); > tree ret; > > + /* Check if the target was going to handle it through the special > + division callback hook. */ > + if (targetm.vectorize.can_special_div_by_const (code, type, rhs1, > + rhs2, NULL, > + NULL_RTX, > NULL_RTX)) > + return NULL_TREE; > + > + > if (!optimize > || !VECTOR_INTEGER_TYPE_P (type) > || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect- > patterns.cc b/gcc/tree-vect-patterns.cc index > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af > 0b1bfea10fe443 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo, > > return pattern_stmt; > } > + else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype, > + oprnd0, oprnd1, NULL, > + NULL_RTX, NULL_RTX)) > + { > + return NULL; > + } > > if (prec > HOST_BITS_PER_WIDE_INT > || integer_zerop (oprnd1)) > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd6 > 8e0e1c1e93faafe 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo, > } > target_support_p = (optab_handler (optab, vec_mode) > != CODE_FOR_nothing); > + if (!target_support_p) > + target_support_p > + = targetm.vectorize.can_special_div_by_const (code, vectype, > + op0, op1, NULL, > + NULL_RTX, > NULL_RTX); > } > > bool using_emulated_vectors_p = vect_emulated_vector_p (vectype); > > > > > --
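[Editor's note: the following is a minimal standalone C sketch, not part of the patch, that checks the scalar identity the expander above relies on for the byte case: x / 0xff computed as (x + ((x + 257) >> 8)) >> 8 with the arithmetic done in double the precision of x. The helper name div_by_255 is illustrative only.]

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the vector sequence: the inner (x + 257) >> 8 is what
   addhn computes (add in wider precision, write back the top half), the
   outer add is the uaddw, and the final >> 8 is the ushr that combine
   later turns into a uzp2 with movi #0.  */
static uint32_t div_by_255 (uint32_t x)
{
  uint32_t t = (x + 257) >> 8;   /* addhn step, done in 32-bit precision.  */
  return (x + t) >> 8;           /* uaddw + ushr step.  */
}

int main (void)
{
  /* The vectorized loop divides a uint8_t * uint8_t product, so all
     16-bit inputs cover every value the pattern has to handle.  */
  for (uint32_t x = 0; x <= 0xffff; x++)
    if (div_by_255 (x) != x / 0xff)
      abort ();
  return 0;
}

The same reasoning carries over to 16- and 32-bit elements with 0xffff and 0xffffffff as divisors, which is what draw_bitmap3 and draw_bitmap4 in the new test exercise.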
Ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: 'Tamar Christina' <tamar.christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford
> <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask
> division.
+__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * level) / 0xffffU; } > > + > > +#include "vect-div-bitmask.h" > > + > > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > > +detected" "vect" { target aarch64*-*-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720 > > 157701d9d1cf852 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > @@ -0,0 +1,26 @@ > > +/* { dg-require-effective-target vect_int } */ > > +/* { dg-additional-options "-fno-vect-cost-model" { target > > +aarch64*-*-* } } */ > > + > > +#include <stdint.h> > > +#include "tree-vect.h" > > + > > +#define N 50 > > +#define TYPE uint32_t > > + > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > + > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > + > > +#include "vect-div-bitmask.h" > > + > > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > > +detected" "vect" { target aarch64*-*-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1 > > 832f28ebd07993e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > @@ -0,0 +1,43 @@ > > +#include <stdio.h> > > + > > +#ifndef N > > +#define N 65 > > +#endif > > + > > +#ifndef TYPE > > +#define TYPE uint32_t > > +#endif > > + > > +#ifndef DEBUG > > +#define DEBUG 0 > > +#endif > > + > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4) > > + > > +int main () > > +{ > > + TYPE a[N]; > > + TYPE b[N]; > > + > > + for (int i = 0; i < N; ++i) > > + { > > + a[i] = BASE + i * 13; > > + b[i] = BASE + i * 13; > > + if (DEBUG) > > + printf ("%d: 0x%x\n", i, a[i]); > > + } > > + > > + fun1 (a, N / 2, N); > > + fun2 (b, N / 2, N); > > + > > + for (int i = 0; i < N; ++i) > > + { > > + if (DEBUG) > > + printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]); > > + > > + if (a[i] != b[i]) > > + __builtin_abort (); > > + } > > + return 0; > > +} > > + > > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44a > > b211cd246d82d5 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > @@ -0,0 +1,61 @@ > > +/* { dg-do compile } */ > > +/* { dg-additional-options "-O3 -std=c99" } */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } > > +} */ > > + > > +#include <stdint.h> > > + > > +#pragma GCC target "+nosve" > > + > > +/* > > +** draw_bitmap1: > > +** ... > > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > > +** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b > > +** ... 
> > +*/ > > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xff; } > > + > > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xfe; } > > + > > +/* > > +** draw_bitmap3: > > +** ... > > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > > +** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h > > +** ... > > +*/ > > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xffffU; } > > + > > +/* > > +** draw_bitmap4: > > +** ... > > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > > +** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > > +** ... > > +*/ > > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index > > > 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c > > 9a12046b6ec94f3 100644 > > --- a/gcc/tree-vect-generic.cc > > +++ b/gcc/tree-vect-generic.cc > > @@ -1237,6 +1237,14 @@ expand_vector_operation > (gimple_stmt_iterator > > *gsi, tree type, tree compute_type > > tree rhs2 = gimple_assign_rhs2 (assign); > > tree ret; > > > > + /* Check if the target was going to handle it through the special > > + division callback hook. */ > > + if (targetm.vectorize.can_special_div_by_const (code, type, rhs1, > > + rhs2, NULL, > > + NULL_RTX, > > NULL_RTX)) > > + return NULL_TREE; > > + > > + > > if (!optimize > > || !VECTOR_INTEGER_TYPE_P (type) > > || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect- > > patterns.cc b/gcc/tree-vect-patterns.cc index > > > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af > > 0b1bfea10fe443 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo, > > > > return pattern_stmt; > > } > > + else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype, > > + oprnd0, oprnd1, NULL, > > + NULL_RTX, NULL_RTX)) > > + { > > + return NULL; > > + } > > > > if (prec > HOST_BITS_PER_WIDE_INT > > || integer_zerop (oprnd1)) > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > > c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd6 > > 8e0e1c1e93faafe 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo, > > } > > target_support_p = (optab_handler (optab, vec_mode) > > != CODE_FOR_nothing); > > + if (!target_support_p) > > + target_support_p > > + = targetm.vectorize.can_special_div_by_const (code, vectype, > > + op0, op1, NULL, > > + NULL_RTX, > > NULL_RTX); > > } > > > > bool using_emulated_vectors_p = vect_emulated_vector_p (vectype); > > > > > > > > > > --
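A note on the can_special_div_by_const hook documented in the quoted patch above: its contract is two-phase — a query call in which in0, in1 and output are all null and nothing may be emitted, followed by an emit call that generates the instructions into *output. The small standalone program below is only a scalar model of that calling convention; the names are made up and plain integers stand in for trees and rtxes, so it is an illustration of the protocol, not GCC code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Query phase: true iff the divisor is the "bitmask" 2^(prec/2) - 1.  */
static bool
can_special_div_p (uint64_t divisor, unsigned prec)
{
  return divisor == (1ULL << (prec / 2)) - 1;
}

/* Emit phase stand-in: compute one element with the
   (x + ((x + addend) >> shift)) >> shift rewrite, addend = 2^shift + 1.  */
static bool
special_div_emit (uint64_t x, unsigned prec, uint64_t *output)
{
  if (!output)
    return false;
  unsigned shift = prec / 2;
  uint64_t addend = (1ULL << shift) + 1;   /* 257 for 16-bit elements.  */
  *output = (x + ((x + addend) >> shift)) >> shift;
  return true;
}

int
main (void)
{
  uint64_t q;
  /* First ask whether the divisor is handled, then "emit".  */
  if (can_special_div_p (0xff, 16) && special_div_emit (200 * 143, 16, &q))
    printf ("200*143/255 = %llu\n", (unsigned long long) q);   /* prints 112 */
  return 0;
}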
Hi Tamar,

> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Monday, October 31, 2022 11:35 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>; Richard Sandiford <Richard.Sandiford@arm.com>
> Subject: RE: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask division.
>
> Hi All,
>
> Ping, and updated patch based on mid-end changes.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): New.
> 	* config/aarch64/aarch64.cc
> 	(aarch64_vectorize_can_special_div_by_constant): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/div-by-bitmask.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f0ba6386c1ab50f77e 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>"
>  }
> )

Some editorial comments.

>
> +;; div optimizations using narrowings
> +;; we can do the division e.g. shorts by 255 faster by calculating it as
> +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> +;; double the precision of x.
> +;;
> +;; If we imagine a short as being composed of two blocks of bytes then
> +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to

Typo "equivalent"

> +;; adding 1 to each sub component:
> +;;
> +;;       short value of 16-bits
> +;; ┌──────────────┬────────────────┐
> +;; │              │                │
> +;; └──────────────┴────────────────┘
> +;;  8-bit part1 ▲    8-bit part2 ▲
> +;;              │                │
> +;;              │                │
> +;;             +1               +1
> +;;
> +;; after the first addition, we have to shift right by 8, and narrow the
> +;; results back to a byte.  Remember that the addition must be done in
> +;; double the precision of the input.  Since 8 is half the size of a short
> +;; we can use a narrowing halfing instruction in AArch64, addhn which also
> +;; does the addition in a wider precision and narrows back to a byte.  The
> +;; shift itself is implicit in the operation as it writes back only the top
> +;; half of the result. i.e. bits 2*esize-1:esize.
> +;;
> +;; Since we have narrowed the result of the first part back to a byte, for
> +;; the second addition we can use a widening addition, uaddw.
> +;;
> +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr by 8

"final shift", "unsigned arithmetic"

> +;; to shift and the vectorizer.

Incomplete sentence?

> +;;
> +;; The shift is later optimized by combine to a uzp2 with movi #0.
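As a cross-check of the sequence this comment walks through, the three steps can be mimicked per element in plain C. This is only an illustration of the arithmetic, not code from the patch; addhn_elem and uaddw_elem are made-up helper names standing in for the instructions:

#include <assert.h>
#include <stdint.h>

/* addhn: add in wider precision, keep the high half (the implicit >> 8).  */
static uint8_t
addhn_elem (uint16_t a, uint16_t b)
{
  return (uint8_t) (((uint32_t) a + b) >> 8);
}

/* uaddw: widen the narrow operand and add.  */
static uint16_t
uaddw_elem (uint16_t a, uint8_t b)
{
  return (uint16_t) (a + b);
}

int
main (void)
{
  for (uint32_t p = 0; p < 256; p++)
    for (uint32_t l = 0; l < 256; l++)
      {
	uint16_t x = (uint16_t) (p * l);	/* pixel * level */
	uint8_t t1 = addhn_elem (x, 0x0101);	/* (x + 257) >> 8, narrowed */
	uint16_t t2 = uaddw_elem (x, t1);	/* x + t1 */
	uint8_t q = (uint8_t) (t2 >> 8);	/* ushr (or uzp2 high half) */
	assert (q == x / 0xff);
      }
  return 0;
}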
> +(define_expand "@aarch64_bitmask_udiv<mode>3"
> +  [(match_operand:VQN 0 "register_operand")
> +   (match_operand:VQN 1 "register_operand")
> +   (match_operand:VQN 2 "immediate_operand")]
> +  "TARGET_SIMD"
> +{
> +  unsigned HOST_WIDE_INT size
> +    = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
> +  if (!CONST_VECTOR_P (operands[2])
> +      || const_vector_encoded_nelts (operands[2]) != 1
> +      || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0)))
> +    FAIL;
> +
> +  rtx addend = gen_reg_rtx (<MODE>mode);
> +  rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
> +  emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
> +  rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
> +  rtx tmp2 = gen_reg_rtx (<MODE>mode);
> +  emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
> +  unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
> +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
> +  emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
> +  emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
> +  DONE;
> +})

Does all this work for big-endian too? I think it does, but wonder whether you've tested.
Ok if so, with the comments addressed.
Thanks,
Kyrill

> +
>  ;; pmul.
>
>  (define_insn "aarch64_pmul<mode>"
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..d3c3650d7d728f56adb65154127dc7b72386c5a7 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -24146,6 +24146,40 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>    return ret;
>  }
>
> +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST.  */
> +
> +bool
> +aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
> +					       tree vectype, wide_int cst,
> +					       rtx *output, rtx in0, rtx in1)
> +{
> +  if (code != TRUNC_DIV_EXPR
> +      || !TYPE_UNSIGNED (vectype))
> +    return false;
> +
> +  unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype));
> +  if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
> +    return false;
> +
> +  if (in0 == NULL_RTX && in1 == NULL_RTX)
> +    {
> +      wide_int val = wi::add (cst, 1);
> +      int pow = wi::exact_log2 (val);
> +      return pow == (int)(element_precision (vectype) / 2);
> +    }
> +
> +  if (!VECTOR_TYPE_P (vectype))
> +    return false;
> +
> +  gcc_assert (output);
> +
> +  if (!*output)
> +    *output = gen_reg_rtx (TYPE_MODE (vectype));
> +
> +  emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1));
> +  return true;
> +}
> +
>  /* Generate a byte permute mask for a register of mode MODE,
>     which has NUNITS units.
*/ > > @@ -27606,6 +27640,10 @@ aarch64_libgcc_floating_mode_supported_p > #undef TARGET_VECTOR_ALIGNMENT > #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment > > +#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > +#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \ > + aarch64_vectorize_can_special_div_by_constant > + > #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT > #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ > aarch64_vectorize_preferred_vector_alignment > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf > 44ab211cd246d82d5 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > @@ -0,0 +1,61 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -std=c99" } */ > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */ > + > +#include <stdint.h> > + > +#pragma GCC target "+nosve" > + > +/* > +** draw_bitmap1: > +** ... > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > +** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b > +** ... > +*/ > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) > +{ > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xff; > +} > + > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) > +{ > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xfe; > +} > + > +/* > +** draw_bitmap3: > +** ... > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > +** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h > +** ... > +*/ > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) > +{ > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * level) / 0xffffU; > +} > + > +/* > +** draw_bitmap4: > +** ... > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > +** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > +** ... > +*/ > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) > +{ > + for (int i = 0; i < (n & -16); i+=1) > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; > +} > > > -----Original Message----- > > From: Tamar Christina <tamar.christina@arm.com> > > Sent: Friday, September 23, 2022 10:34 AM > > To: gcc-patches@gcc.gnu.org > > Cc: nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>; > > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov > > <Kyrylo.Tkachov@arm.com>; Richard Sandiford > > <Richard.Sandiford@arm.com> > > Subject: [PATCH 2/4]AArch64 Add implementation for pow2 bitmask > division. > > > > Hi All, > > > > This adds an implementation for the new optab for unsigned pow2 bitmask > > for AArch64. > > > > The implementation rewrites: > > > > x = y / (2 ^ (sizeof (y)/2)-1 > > > > into e.g. (for bytes) > > > > (x + ((x + 257) >> 8)) >> 8 > > > > where it's required that the additions be done in double the precision of x > > such that we don't lose any bits during an overflow. 
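The rewrite described above is easy to sanity-check exhaustively in scalar code. The snippet below is only an illustration, not part of the patch: it verifies the identity for every 16-bit value when the additions use 32-bit intermediates. With only 16-bit arithmetic the first addition can wrap (e.g. 0xffff + 257 exceeds 16 bits), which is exactly why the wider precision is needed.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (uint32_t x = 0; x <= 0xffff; x++)
    {
      /* 32-bit intermediates, i.e. "double the precision of x".  */
      uint32_t q = (x + ((x + 257u) >> 8)) >> 8;
      assert (q == x / 255u);
    }
  return 0;
}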
> > > > Essentially the sequence decomposes the division into doing two smaller > > divisions, one for the top and bottom parts of the number and adding the > > results back together. > > > > To account for the fact that shift by 8 would be division by 256 we add 1 to > > both parts of x such that when 255 we still get 1 as the answer. > > > > Because the amount we shift are half the original datatype we can use the > > halfing instructions the ISA provides to do the operation instead of using > > actual shifts. > > > > For AArch64 this means we generate for: > > > > void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { > > for (int i = 0; i < (n & -16); i+=1) > > pixel[i] = (pixel[i] * level) / 0xff; } > > > > the following: > > > > movi v3.16b, 0x1 > > umull2 v1.8h, v0.16b, v2.16b > > umull v0.8h, v0.8b, v2.8b > > addhn v5.8b, v1.8h, v3.8h > > addhn v4.8b, v0.8h, v3.8h > > uaddw v1.8h, v1.8h, v5.8b > > uaddw v0.8h, v0.8h, v4.8b > > uzp2 v0.16b, v0.16b, v1.16b > > > > instead of: > > > > umull v2.8h, v1.8b, v5.8b > > umull2 v1.8h, v1.16b, v5.16b > > umull v0.4s, v2.4h, v3.4h > > umull2 v2.4s, v2.8h, v3.8h > > umull v4.4s, v1.4h, v3.4h > > umull2 v1.4s, v1.8h, v3.8h > > uzp2 v0.8h, v0.8h, v2.8h > > uzp2 v1.8h, v4.8h, v1.8h > > shrn v0.8b, v0.8h, 7 > > shrn2 v0.16b, v1.8h, 7 > > > > Which results in significantly faster code. > > > > Thanks for Wilco for the concept. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-simd.md > > (@aarch64_bitmask_udiv<mode>3): New. > > * config/aarch64/aarch64.cc > > (aarch64_vectorize_can_special_div_by_constant): New. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/div-by-bitmask.c: New test. > > > > --- inline copy of patch -- > > diff --git a/gcc/config/aarch64/aarch64-simd.md > > b/gcc/config/aarch64/aarch64-simd.md > > index > > > 587a45d77721e1b39accbad7dbeca4d741eccb10..f4152160084d6b6f34bd69f > 0b > > a6386c1ab50f77e 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -4831,6 +4831,65 @@ (define_expand > > "aarch64_<sur><addsub>hn2<mode>" > > } > > ) > > > > +;; div optimizations using narrowings > > +;; we can do the division e.g. shorts by 255 faster by calculating it > > +as ;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in ;; > > +double the precision of x. > > +;; > > +;; If we imagine a short as being composed of two blocks of bytes then > > +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to ;; > > +adding 1 to each sub component: > > +;; > > +;; short value of 16-bits > > +;; ┌──────────────┬────────────────┐ > > +;; │ │ │ > > +;; └──────────────┴────────────────┘ > > +;; 8-bit part1 ▲ 8-bit part2 ▲ > > +;; │ │ > > +;; │ │ > > +;; +1 +1 > > +;; > > +;; after the first addition, we have to shift right by 8, and narrow > > +the ;; results back to a byte. Remember that the addition must be done > > +in ;; double the precision of the input. Since 8 is half the size of a > > +short ;; we can use a narrowing halfing instruction in AArch64, addhn > > +which also ;; does the addition in a wider precision and narrows back > > +to a byte. The ;; shift itself is implicit in the operation as it > > +writes back only the top ;; half of the result. i.e. bits 2*esize-1:esize. 
> > +;; > > +;; Since we have narrowed the result of the first part back to a byte, > > +for ;; the second addition we can use a widening addition, uaddw. > > +;; > > +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr > > +by 8 ;; to shift and the vectorizer. > > +;; > > +;; The shift is later optimized by combine to a uzp2 with movi #0. > > +(define_expand "@aarch64_bitmask_udiv<mode>3" > > + [(match_operand:VQN 0 "register_operand") > > + (match_operand:VQN 1 "register_operand") > > + (match_operand:VQN 2 "immediate_operand")] > > + "TARGET_SIMD" > > +{ > > + unsigned HOST_WIDE_INT size > > + = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; > > + if (!CONST_VECTOR_P (operands[2]) > > + || const_vector_encoded_nelts (operands[2]) != 1 > > + || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0))) > > + FAIL; > > + > > + rtx addend = gen_reg_rtx (<MODE>mode); > > + rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, > 1); > > + emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, > > +<VNARROWQ2>mode)); > > + rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); > > + rtx tmp2 = gen_reg_rtx (<MODE>mode); > > + emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); > > + unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); > > + rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, > > +bitsize); > > + emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], > tmp1)); > > + emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, > > +shift_vector)); > > + DONE; > > +}) > > + > > ;; pmul. > > > > (define_insn "aarch64_pmul<mode>" > > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc > > index > > > 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc > 3 > > 7484b6fc6901bfb4 100644 > > --- a/gcc/config/aarch64/aarch64.cc > > +++ b/gcc/config/aarch64/aarch64.cc > > @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const > > (machine_mode vmode, machine_mode op_mode, > > return ret; > > } > > > > +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */ > > + > > +bool > > +aarch64_vectorize_can_special_div_by_constant (enum tree_code code, > > + tree vectype, > > + tree treeop0, tree treeop1, > > + rtx *output, rtx in0, rtx in1) { > > + > > + if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX)) > > + return false; > > + > > + tree cst = uniform_integer_cst_p (treeop1); tree type; if (code != > > + TRUNC_DIV_EXPR > > + || !cst > > + || !TYPE_UNSIGNED ((type = TREE_TYPE (cst))) > > + || tree_int_cst_sgn (cst) != 1) > > + return false; > > + > > + unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE > > + (vectype)); if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) > > + return false; > > + > > + if (in0 == NULL_RTX && in1 == NULL_RTX) > > + { > > + gcc_assert (treeop0 && treeop1); > > + wide_int icst = wi::to_wide (cst); > > + wide_int val = wi::add (icst, 1); > > + int pow = wi::exact_log2 (val); > > + return pow == (TYPE_PRECISION (type) / 2); > > + } > > + > > + if (!VECTOR_TYPE_P (vectype)) > > + return false; > > + > > + gcc_assert (output); > > + > > + if (!*output) > > + *output = gen_reg_rtx (TYPE_MODE (vectype)); > > + > > + emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, > > +in0, in1)); > > + return true; > > +} > > + > > /* Generate a byte permute mask for a register of mode MODE, > > which has NUNITS units. 
*/ > > > > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index > > > 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d > 2 > > 44a2a23e76cac097 100644 > > --- a/gcc/doc/tm.texi > > +++ b/gcc/doc/tm.texi > > @@ -6112,6 +6112,22 @@ instruction pattern. There is no need for the > hook > > to handle these two implementation approaches itself. > > @end deftypefn > > > > +@deftypefn {Target Hook} bool > > TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > > +(enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree > > +@var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) This > > +hook is used to test whether the target has a special method of > > +division of vectors of type @var{vectype} using the two operands > > @code{treeop0}, and @code{treeop1} and producing a vector of type > > @var{vectype}. The division will then not be decomposed by the and kept > as > > a div. > > + > > +When the hook is being used to test whether the target supports a > > +special divide, @var{in0}, @var{in1}, and @var{output} are all null. > > +When the hook is being used to emit a division, @var{in0} and @var{in1} > > +are the source vectors of type @var{vecttype} and @var{output} is the > > +destination vector of type @var{vectype}. > > + > > +Return true if the operation is possible, emitting instructions for it > > +if rtxes are provided and updating @var{output}. > > +@end deftypefn > > + > > @deftypefn {Target Hook} tree > > TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned > > @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in}) This hook > > should return the decl of a function that implements the vectorized variant > > of the function with the @code{combined_fn} code diff --git > > a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index > > > 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b > 04 > > 076d058c24ce093 100644 > > --- a/gcc/doc/tm.texi.in > > +++ b/gcc/doc/tm.texi.in > > @@ -4164,6 +4164,8 @@ address; but often a machine-dependent > strategy > > can generate better code. > > > > @hook TARGET_VECTORIZE_VEC_PERM_CONST > > > > +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST > > + > > @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION > > > > @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION > > diff --git a/gcc/explow.cc b/gcc/explow.cc index > > > ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae67 > 8f > > 5e346bf34ba0036 100644 > > --- a/gcc/explow.cc > > +++ b/gcc/explow.cc > > @@ -1037,7 +1037,7 @@ round_push (rtx size) > > TRUNC_DIV_EXPR. 
*/ > > size = expand_binop (Pmode, add_optab, size, alignm1_rtx, > > NULL_RTX, 1, OPTAB_LIB_WIDEN); > > - size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx, > > + size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, > > + align_rtx, > > NULL_RTX, 1); > > size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1); > > > > @@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned > > required_align) > > gen_int_mode (required_align / BITS_PER_UNIT - 1, > > Pmode), > > NULL_RTX, 1, OPTAB_LIB_WIDEN); > > - target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target, > > + target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, > > target, > > gen_int_mode (required_align / BITS_PER_UNIT, > > Pmode), > > NULL_RTX, 1); > > diff --git a/gcc/expmed.h b/gcc/expmed.h index > > > 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501 > c6 > > f33cb3595659b5 100644 > > --- a/gcc/expmed.h > > +++ b/gcc/expmed.h > > @@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, > > machine_mode, extern rtx expand_shift (enum tree_code, > machine_mode, > > rtx, poly_int64, rtx, > > int); > > #ifdef GCC_OPTABS_H > > -extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx, > > - rtx, int, enum optab_methods = > > OPTAB_LIB_WIDEN); > > +extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, > > tree, > > + rtx, rtx, rtx, int, > > + enum optab_methods = OPTAB_LIB_WIDEN); > > #endif > > #endif > > > > diff --git a/gcc/expmed.cc b/gcc/expmed.cc index > > > 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb > 09 > > 90db8b97d3af414 100644 > > --- a/gcc/expmed.cc > > +++ b/gcc/expmed.cc > > @@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx > > op0, HOST_WIDE_INT d) > > > > rtx > > expand_divmod (int rem_flag, enum tree_code code, machine_mode > > mode, > > - rtx op0, rtx op1, rtx target, int unsignedp, > > - enum optab_methods methods) > > + tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target, > > + int unsignedp, enum optab_methods methods) > > { > > machine_mode compute_mode; > > rtx tquotient; > > @@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code > > code, machine_mode mode, > > > > last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0; > > > > + /* Check if the target has specific expansions for the division. */ > > + if (treeop0 > > + && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE > > (treeop0), > > + treeop0, treeop1, > > + &target, op0, op1)) > > + return target; > > + > > + > > /* Now convert to the best mode to use. 
*/ > > if (compute_mode != mode) > > { > > @@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code > > code, machine_mode mode, > > || (optab_handler (sdivmod_optab, int_mode) > > != CODE_FOR_nothing))) > > quotient = expand_divmod (0, TRUNC_DIV_EXPR, > > - int_mode, op0, > > - gen_int_mode (abs_d, > > + int_mode, treeop0, treeop1, > > + op0, gen_int_mode (abs_d, > > int_mode), > > NULL_RTX, 0); > > else > > @@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code > > code, machine_mode mode, > > size - 1, NULL_RTX, 0); > > t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign), > > NULL_RTX); > > - t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, > > op1, > > - NULL_RTX, 0); > > + t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, > > treeop0, > > + treeop1, t3, op1, NULL_RTX, 0); > > if (t4) > > { > > rtx t5; > > diff --git a/gcc/expr.cc b/gcc/expr.cc > > index > > > 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd > 96 > > a8abc055fa34d9 100644 > > --- a/gcc/expr.cc > > +++ b/gcc/expr.cc > > @@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target) > > return expand_divmod (0, > > FLOAT_MODE_P (GET_MODE (value)) > > ? RDIV_EXPR : TRUNC_DIV_EXPR, > > - GET_MODE (value), op1, op2, target, 0); > > + GET_MODE (value), NULL, NULL, op1, op2, > > + target, 0); > > case MOD: > > - return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > > op1, op2, > > - target, 0); > > + return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > > NULL, NULL, > > + op1, op2, target, 0); > > case UDIV: > > - return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), > > op1, op2, > > - target, 1); > > + return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), > > NULL, NULL, > > + op1, op2, target, 1); > > case UMOD: > > - return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > > op1, op2, > > - target, 1); > > + return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), > > NULL, NULL, > > + op1, op2, target, 1); > > case ASHIFTRT: > > return expand_simple_binop (GET_MODE (value), code, op1, op2, > > target, 0, OPTAB_LIB_WIDEN); > > @@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, > > machine_mode mode, tree treeop0, > > bool speed_p = optimize_insn_for_speed_p (); > > do_pending_stack_adjust (); > > start_sequence (); > > - rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, > 1); > > + rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1, > > + op0, op1, target, 1); > > rtx_insn *uns_insns = get_insns (); > > end_sequence (); > > start_sequence (); > > - rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, > 0); > > + rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1, > > + op0, op1, target, 0); > > rtx_insn *sgn_insns = get_insns (); > > end_sequence (); > > unsigned uns_cost = seq_cost (uns_insns, speed_p); @@ -9016,7 > +9019,8 > > @@ expand_expr_divmod (tree_code code, machine_mode mode, tree > > treeop0, > > emit_insn (sgn_insns); > > return sgn_ret; > > } > > - return expand_divmod (mod_p, code, mode, op0, op1, target, > unsignedp); > > + return expand_divmod (mod_p, code, mode, treeop0, treeop1, > > + op0, op1, target, unsignedp); > > } > > > > rtx > > diff --git a/gcc/optabs.cc b/gcc/optabs.cc index > > > 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abf > d > > 872f340855dc96 100644 > > --- a/gcc/optabs.cc > > +++ b/gcc/optabs.cc > > @@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode > mode, > > rtx op0, rtx op1, bool unsignedp) > > return NULL_RTX; > > } 
> > } > > - rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, > > sum, > > - gen_int_mode (INTVAL (op1), > > word_mode), > > + rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, > > NULL, NULL, > > + sum, gen_int_mode (INTVAL (op1), > > + word_mode), > > NULL_RTX, 1, OPTAB_DIRECT); > > if (remainder == NULL_RTX) > > return NULL_RTX; > > @@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode > > mode, rtx op0, rtx op1, rtx *rem, > > > > if (op11 != const1_rtx) > > { > > - rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11, > > - NULL_RTX, unsignedp, OPTAB_DIRECT); > > + rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, > > quot1, > > + op11, NULL_RTX, unsignedp, > > OPTAB_DIRECT); > > if (rem2 == NULL_RTX) > > return NULL_RTX; > > > > @@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode > > mode, rtx op0, rtx op1, rtx *rem, > > if (rem2 == NULL_RTX) > > return NULL_RTX; > > > > - rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11, > > - NULL_RTX, unsignedp, OPTAB_DIRECT); > > + rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, > > quot1, > > + op11, NULL_RTX, unsignedp, > > OPTAB_DIRECT); > > if (quot2 == NULL_RTX) > > return NULL_RTX; > > > > diff --git a/gcc/target.def b/gcc/target.def index > > > 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b0 > 70 > > 81cdd70113db9b1 100644 > > --- a/gcc/target.def > > +++ b/gcc/target.def > > @@ -1902,6 +1902,25 @@ implementation approaches itself.", > > const vec_perm_indices &sel), > > NULL) > > > > +DEFHOOK > > +(can_special_div_by_const, > > + "This hook is used to test whether the target has a special method > > +of\n\ division of vectors of type @var{vectype} using the two operands > > +@code{treeop0},\n\ and @code{treeop1} and producing a vector of type > > +@var{vectype}. The division\n\ will then not be decomposed by the and > > +kept as a div.\n\ \n\ When the hook is being used to test whether the > > +target supports a special\n\ divide, @var{in0}, @var{in1}, and > > +@var{output} are all null. When the hook\n\ is being used to emit a > > +division, @var{in0} and @var{in1} are the source\n\ vectors of type > > +@var{vecttype} and @var{output} is the destination vector of\n\ type > > +@var{vectype}.\n\ \n\ Return true if the operation is possible, > > +emitting instructions for it\n\ if rtxes are provided and updating > > +@var{output}.", bool, (enum tree_code, tree vectype, tree treeop0, > > +tree treeop1, rtx *output, > > + rtx in0, rtx in1), > > + default_can_special_div_by_const) > > + > > /* Return true if the target supports misaligned store/load of a > > specific factor denoted in the third parameter. The last parameter > > is true if the access is defined in a packed struct. 
*/ diff --git > a/gcc/target.h > > b/gcc/target.h index > > > d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da > 56 > > f39c061f68b665 100644 > > --- a/gcc/target.h > > +++ b/gcc/target.h > > @@ -51,6 +51,7 @@ > > #include "insn-codes.h" > > #include "tm.h" > > #include "hard-reg-set.h" > > +#include "tree-core.h" > > > > #if CHECKING_P > > > > diff --git a/gcc/targhooks.h b/gcc/targhooks.h index > > > ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e > > 2640d63f936b336d 100644 > > --- a/gcc/targhooks.h > > +++ b/gcc/targhooks.h > > @@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage > > (addr_space_t, location_t); extern rtx default_addr_space_convert (rtx, > > tree, tree); extern unsigned int default_case_values_threshold (void); > > extern bool default_have_conditional_execution (void); > > +extern bool default_can_special_div_by_const (enum tree_code, tree, > > tree, tree, > > + rtx *, rtx, rtx); > > > > extern bool default_libc_has_function (enum function_class, tree); extern > > bool default_libc_has_fast_function (int fcode); diff --git a/gcc/targhooks.cc > > b/gcc/targhooks.cc index > > > b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba2 > 41 > > 279936ced41ee95 100644 > > --- a/gcc/targhooks.cc > > +++ b/gcc/targhooks.cc > > @@ -1807,6 +1807,14 @@ default_have_conditional_execution (void) > > return HAVE_conditional_execution; > > } > > > > +/* Default that no division by constant operations are special. */ > > +bool default_can_special_div_by_const (enum tree_code, tree, tree, > > +tree, rtx *, rtx, > > + rtx) > > +{ > > + return false; > > +} > > + > > /* By default we assume that c99 functions are present at the runtime, > > but sincos is not. */ > > bool > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b491 > 6f3 > > d7b4d5b64a19b9 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c > > @@ -0,0 +1,25 @@ > > +/* { dg-require-effective-target vect_int } */ > > + > > +#include <stdint.h> > > +#include "tree-vect.h" > > + > > +#define N 50 > > +#define TYPE uint8_t > > + > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * level) / 0xff; } > > + > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * level) / 0xff; } > > + > > +#include "vect-div-bitmask.h" > > + > > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > > +detected" "vect" { target aarch64*-*-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..e904a71885b2e8487593a2c > d3 > > db75b3e4112e2cc > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c > > @@ -0,0 +1,25 @@ > > +/* { dg-require-effective-target vect_int } */ > > + > > +#include <stdint.h> > > +#include "tree-vect.h" > > + > > +#define N 50 > > +#define TYPE uint16_t > > + > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * level) / 0xffffU; } > > + > > 
+__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * level) / 0xffffU; } > > + > > +#include "vect-div-bitmask.h" > > + > > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > > +detected" "vect" { target aarch64*-*-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e7 > 20 > > 157701d9d1cf852 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c > > @@ -0,0 +1,26 @@ > > +/* { dg-require-effective-target vect_int } */ > > +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* > > +} } */ > > + > > +#include <stdint.h> > > +#include "tree-vect.h" > > + > > +#define N 50 > > +#define TYPE uint32_t > > + > > +__attribute__((noipa, noinline, optimize("O1"))) void fun1(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > + > > +__attribute__((noipa, noinline, optimize("O3"))) void fun2(TYPE* > > +restrict pixel, TYPE level, int n) { > > + for (int i = 0; i < n; i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > + > > +#include "vect-div-bitmask.h" > > + > > +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: > > +detected" "vect" { target aarch64*-*-* } } } */ > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..29a16739aa4b706616367bf > d1 > > 832f28ebd07993e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h > > @@ -0,0 +1,43 @@ > > +#include <stdio.h> > > + > > +#ifndef N > > +#define N 65 > > +#endif > > + > > +#ifndef TYPE > > +#define TYPE uint32_t > > +#endif > > + > > +#ifndef DEBUG > > +#define DEBUG 0 > > +#endif > > + > > +#define BASE ((TYPE) -1 < 0 ? -126 : 4) > > + > > +int main () > > +{ > > + TYPE a[N]; > > + TYPE b[N]; > > + > > + for (int i = 0; i < N; ++i) > > + { > > + a[i] = BASE + i * 13; > > + b[i] = BASE + i * 13; > > + if (DEBUG) > > + printf ("%d: 0x%x\n", i, a[i]); > > + } > > + > > + fun1 (a, N / 2, N); > > + fun2 (b, N / 2, N); > > + > > + for (int i = 0; i < N; ++i) > > + { > > + if (DEBUG) > > + printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]); > > + > > + if (a[i] != b[i]) > > + __builtin_abort (); > > + } > > + return 0; > > +} > > + > > diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > new file mode 100644 > > index > > > 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf > 44a > > b211cd246d82d5 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c > > @@ -0,0 +1,61 @@ > > +/* { dg-do compile } */ > > +/* { dg-additional-options "-O3 -std=c99" } */ > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } > > +*/ > > + > > +#include <stdint.h> > > + > > +#pragma GCC target "+nosve" > > + > > +/* > > +** draw_bitmap1: > > +** ... > > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > > +** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h > > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > > +** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b > > +** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b > > +** ... 
> > +*/ > > +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xff; } > > + > > +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xfe; } > > + > > +/* > > +** draw_bitmap3: > > +** ... > > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > > +** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s > > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > > +** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h > > +** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h > > +** ... > > +*/ > > +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * level) / 0xffffU; } > > + > > +/* > > +** draw_bitmap4: > > +** ... > > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > > +** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d > > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > > +** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s > > +** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s > > +** ... > > +*/ > > +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n) { > > + for (int i = 0; i < (n & -16); i+=1) > > + pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL; } > > diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc index > > > 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817 > c > > 9a12046b6ec94f3 100644 > > --- a/gcc/tree-vect-generic.cc > > +++ b/gcc/tree-vect-generic.cc > > @@ -1237,6 +1237,14 @@ expand_vector_operation > (gimple_stmt_iterator > > *gsi, tree type, tree compute_type > > tree rhs2 = gimple_assign_rhs2 (assign); > > tree ret; > > > > + /* Check if the target was going to handle it through the special > > + division callback hook. */ > > + if (targetm.vectorize.can_special_div_by_const (code, type, rhs1, > > + rhs2, NULL, > > + NULL_RTX, > > NULL_RTX)) > > + return NULL_TREE; > > + > > + > > if (!optimize > > || !VECTOR_INTEGER_TYPE_P (type) > > || TREE_CODE (rhs2) != VECTOR_CST diff --git a/gcc/tree-vect- > > patterns.cc b/gcc/tree-vect-patterns.cc index > > > 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85 > af > > 0b1bfea10fe443 100644 > > --- a/gcc/tree-vect-patterns.cc > > +++ b/gcc/tree-vect-patterns.cc > > @@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo, > > > > return pattern_stmt; > > } > > + else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype, > > + oprnd0, oprnd1, NULL, > > + NULL_RTX, NULL_RTX)) > > + { > > + return NULL; > > + } > > > > if (prec > HOST_BITS_PER_WIDE_INT > > || integer_zerop (oprnd1)) > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index > > > c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288b > d6 > > 8e0e1c1e93faafe 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo, > > } > > target_support_p = (optab_handler (optab, vec_mode) > > != CODE_FOR_nothing); > > + if (!target_support_p) > > + target_support_p > > + = targetm.vectorize.can_special_div_by_const (code, vectype, > > + op0, op1, NULL, > > + NULL_RTX, > > NULL_RTX); > > } > > > > bool using_emulated_vectors_p = vect_emulated_vector_p (vectype); > > > > > > > > > > --
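The same trick carries over to the wider element sizes exercised by the tests above (uint16_t elements divided by 0xffff, uint32_t elements divided by 0xffffffff): only the addend (0x10001, and analogously 2^32 + 1) and the shift amount change. A quick scalar spot-check for the 16-bit-element case, again just an illustration rather than code from the patch:

#include <assert.h>
#include <stdint.h>

/* x / 0xffff via the bitmask rewrite, with 64-bit intermediates.  */
static uint32_t
div_0xffff (uint32_t x)
{
  uint64_t t = x;
  return (uint32_t) ((t + ((t + 0x10001u) >> 16)) >> 16);
}

int
main (void)
{
  uint32_t edges[] = { 0u, 1u, 0xfffeu, 0xffffu, 0x10000u,
		       0xfffe0001u, 0xfffeffffu, 0xffffffffu };
  for (unsigned i = 0; i < sizeof edges / sizeof edges[0]; i++)
    assert (div_0xffff (edges[i]) == edges[i] / 0xffffu);

  /* Cheap sweep over a spread of values rather than all 2^32 of them.  */
  for (uint64_t x = 0; x <= 0xffffffffu; x += 0x10007u)
    assert (div_0xffff ((uint32_t) x) == (uint32_t) x / 0xffffu);
  return 0;
}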
--- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4831,6 +4831,65 @@ (define_expand "aarch64_<sur><addsub>hn2<mode>" } ) +;; div optimizations using narrowings +;; we can do the division e.g. shorts by 255 faster by calculating it as +;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in +;; double the precision of x. +;; +;; If we imagine a short as being composed of two blocks of bytes then +;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalen to +;; adding 1 to each sub component: +;; +;; short value of 16-bits +;; ┌──────────────┬────────────────┐ +;; │ │ │ +;; └──────────────┴────────────────┘ +;; 8-bit part1 ▲ 8-bit part2 ▲ +;; │ │ +;; │ │ +;; +1 +1 +;; +;; after the first addition, we have to shift right by 8, and narrow the +;; results back to a byte. Remember that the addition must be done in +;; double the precision of the input. Since 8 is half the size of a short +;; we can use a narrowing halfing instruction in AArch64, addhn which also +;; does the addition in a wider precision and narrows back to a byte. The +;; shift itself is implicit in the operation as it writes back only the top +;; half of the result. i.e. bits 2*esize-1:esize. +;; +;; Since we have narrowed the result of the first part back to a byte, for +;; the second addition we can use a widening addition, uaddw. +;; +;; For the finaly shift, since it's unsigned arithmatic we emit an ushr by 8 +;; to shift and the vectorizer. +;; +;; The shift is later optimized by combine to a uzp2 with movi #0. +(define_expand "@aarch64_bitmask_udiv<mode>3" + [(match_operand:VQN 0 "register_operand") + (match_operand:VQN 1 "register_operand") + (match_operand:VQN 2 "immediate_operand")] + "TARGET_SIMD" +{ + unsigned HOST_WIDE_INT size + = (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1; + if (!CONST_VECTOR_P (operands[2]) + || const_vector_encoded_nelts (operands[2]) != 1 + || size != UINTVAL (CONST_VECTOR_ELT (operands[2], 0))) + FAIL; + + rtx addend = gen_reg_rtx (<MODE>mode); + rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1); + emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode)); + rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode); + rtx tmp2 = gen_reg_rtx (<MODE>mode); + emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend)); + unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode); + rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize); + emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1)); + emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector)); + DONE; +}) + ;; pmul. (define_insn "aarch64_pmul<mode>" diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 4b486aeea90ea2afb9cdd96a4dbe15c5bb2abd7a..91bb7d306f36dc4c9eeaafc37484b6fc6901bfb4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -24146,6 +24146,51 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode, return ret; } +/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. 
*/ + +bool +aarch64_vectorize_can_special_div_by_constant (enum tree_code code, + tree vectype, + tree treeop0, tree treeop1, + rtx *output, rtx in0, rtx in1) +{ + + if ((!treeop0 || !treeop1) && (in0 == NULL_RTX || in1 == NULL_RTX)) + return false; + + tree cst = uniform_integer_cst_p (treeop1); + tree type; + if (code != TRUNC_DIV_EXPR + || !cst + || !TYPE_UNSIGNED ((type = TREE_TYPE (cst))) + || tree_int_cst_sgn (cst) != 1) + return false; + + unsigned int flags = aarch64_classify_vector_mode (TYPE_MODE (vectype)); + if ((flags & VEC_ANY_SVE) && !TARGET_SVE2) + return false; + + if (in0 == NULL_RTX && in1 == NULL_RTX) + { + gcc_assert (treeop0 && treeop1); + wide_int icst = wi::to_wide (cst); + wide_int val = wi::add (icst, 1); + int pow = wi::exact_log2 (val); + return pow == (TYPE_PRECISION (type) / 2); + } + + if (!VECTOR_TYPE_P (vectype)) + return false; + + gcc_assert (output); + + if (!*output) + *output = gen_reg_rtx (TYPE_MODE (vectype)); + + emit_insn (gen_aarch64_bitmask_udiv3 (TYPE_MODE (vectype), *output, in0, in1)); + return true; +} + /* Generate a byte permute mask for a register of mode MODE, which has NUNITS units. */ diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 92bda1a7e14a3c9ea63e151e4a49a818bf4d1bdb..adba9fe97a9b43729c5e86d244a2a23e76cac097 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6112,6 +6112,22 @@ instruction pattern. There is no need for the hook to handle these two implementation approaches itself. @end deftypefn +@deftypefn {Target Hook} bool TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST (enum @var{tree_code}, tree @var{vectype}, tree @var{treeop0}, tree @var{treeop1}, rtx *@var{output}, rtx @var{in0}, rtx @var{in1}) +This hook is used to test whether the target has a special method of +division of vectors of type @var{vectype} using the two operands @code{treeop0}, +and @code{treeop1} and producing a vector of type @var{vectype}. The division +will then not be decomposed by the and kept as a div. + +When the hook is being used to test whether the target supports a special +divide, @var{in0}, @var{in1}, and @var{output} are all null. When the hook +is being used to emit a division, @var{in0} and @var{in1} are the source +vectors of type @var{vecttype} and @var{output} is the destination vector of +type @var{vectype}. + +Return true if the operation is possible, emitting instructions for it +if rtxes are provided and updating @var{output}. +@end deftypefn + @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in}) This hook should return the decl of a function that implements the vectorized variant of the function with the @code{combined_fn} code diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 112462310b134705d860153294287cfd7d4af81d..d5a745a02acdf051ea1da1b04076d058c24ce093 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4164,6 +4164,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_VEC_PERM_CONST +@hook TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST + @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION @hook TARGET_VECTORIZE_BUILTIN_MD_VECTORIZED_FUNCTION diff --git a/gcc/explow.cc b/gcc/explow.cc index ddb4d6ae3600542f8d2bb5617cdd3933a9fae6c0..568e0eb1a158c696458ae678f5e346bf34ba0036 100644 --- a/gcc/explow.cc +++ b/gcc/explow.cc @@ -1037,7 +1037,7 @@ round_push (rtx size) TRUNC_DIV_EXPR. 
   size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
+  size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, size, align_rtx,
			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
 
@@ -1203,7 +1203,7 @@ align_dynamic_address (rtx target, unsigned required_align)
			 gen_int_mode (required_align / BITS_PER_UNIT - 1,
				       Pmode),
			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
-  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, target,
+  target = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, NULL, NULL, target,
			  gen_int_mode (required_align / BITS_PER_UNIT,
					Pmode),
			  NULL_RTX, 1);
diff --git a/gcc/expmed.h b/gcc/expmed.h
index 0b2538c4c6bd51dfdc772ef70bdf631c0bed8717..0db2986f11ff4a4b10b59501c6f33cb3595659b5 100644
--- a/gcc/expmed.h
+++ b/gcc/expmed.h
@@ -708,8 +708,9 @@ extern rtx expand_variable_shift (enum tree_code, machine_mode,
 extern rtx expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx,
			 int);
 #ifdef GCC_OPTABS_H
-extern rtx expand_divmod (int, enum tree_code, machine_mode, rtx, rtx,
-			  rtx, int, enum optab_methods = OPTAB_LIB_WIDEN);
+extern rtx expand_divmod (int, enum tree_code, machine_mode, tree, tree,
+			  rtx, rtx, rtx, int,
+			  enum optab_methods = OPTAB_LIB_WIDEN);
 #endif
 #endif
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8d7418be418406e72a895ecddf2dc7fdb950c76c..b64ea5ac46a9da85770a5bb0990db8b97d3af414 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -4222,8 +4222,8 @@ expand_sdiv_pow2 (scalar_int_mode mode, rtx op0, HOST_WIDE_INT d)
 
 rtx
 expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
-	       rtx op0, rtx op1, rtx target, int unsignedp,
-	       enum optab_methods methods)
+	       tree treeop0, tree treeop1, rtx op0, rtx op1, rtx target,
+	       int unsignedp, enum optab_methods methods)
 {
   machine_mode compute_mode;
   rtx tquotient;
@@ -4375,6 +4375,14 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
 
   last_div_const = ! rem_flag && op1_is_constant ? INTVAL (op1) : 0;
 
+  /* Check if the target has specific expansions for the division.  */
+  if (treeop0
+      && targetm.vectorize.can_special_div_by_const (code, TREE_TYPE (treeop0),
+						     treeop0, treeop1,
+						     &target, op0, op1))
+    return target;
+
+
   /* Now convert to the best mode to use.  */
   if (compute_mode != mode)
     {
@@ -4618,8 +4626,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
		    || (optab_handler (sdivmod_optab, int_mode)
			!= CODE_FOR_nothing)))
	      quotient = expand_divmod (0, TRUNC_DIV_EXPR,
-					int_mode, op0,
-					gen_int_mode (abs_d,
+					int_mode, treeop0, treeop1,
+					op0, gen_int_mode (abs_d,
						      int_mode),
					NULL_RTX, 0);
	    else
@@ -4808,8 +4816,8 @@ expand_divmod (int rem_flag, enum tree_code code, machine_mode mode,
				      size - 1, NULL_RTX, 0);
	t3 = force_operand (gen_rtx_MINUS (int_mode, t1, nsign),
			    NULL_RTX);
-	t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, t3, op1,
-			    NULL_RTX, 0);
+	t4 = expand_divmod (0, TRUNC_DIV_EXPR, int_mode, treeop0,
+			    treeop1, t3, op1, NULL_RTX, 0);
	if (t4)
	  {
	    rtx t5;
diff --git a/gcc/expr.cc b/gcc/expr.cc
index 80bb1b8a4c5b8350fb1b8f57a99fd52e5882fcb6..b786f1d75e25f3410c0640cd96a8abc055fa34d9 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -8028,16 +8028,17 @@ force_operand (rtx value, rtx target)
	return expand_divmod (0, FLOAT_MODE_P (GET_MODE (value))
			      ? RDIV_EXPR : TRUNC_DIV_EXPR,
-			      GET_MODE (value), op1, op2, target, 0);
+			      GET_MODE (value), NULL, NULL, op1, op2,
+			      target, 0);
     case MOD:
-      return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-			    target, 0);
+      return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+			    op1, op2, target, 0);
     case UDIV:
-      return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), op1, op2,
-			    target, 1);
+      return expand_divmod (0, TRUNC_DIV_EXPR, GET_MODE (value), NULL, NULL,
+			    op1, op2, target, 1);
     case UMOD:
-      return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), op1, op2,
-			    target, 1);
+      return expand_divmod (1, TRUNC_MOD_EXPR, GET_MODE (value), NULL, NULL,
+			    op1, op2, target, 1);
     case ASHIFTRT:
       return expand_simple_binop (GET_MODE (value), code, op1, op2,
				   target, 0, OPTAB_LIB_WIDEN);
@@ -8990,11 +8991,13 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
       bool speed_p = optimize_insn_for_speed_p ();
       do_pending_stack_adjust ();
       start_sequence ();
-      rtx uns_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 1);
+      rtx uns_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 1);
       rtx_insn *uns_insns = get_insns ();
       end_sequence ();
       start_sequence ();
-      rtx sgn_ret = expand_divmod (mod_p, code, mode, op0, op1, target, 0);
+      rtx sgn_ret = expand_divmod (mod_p, code, mode, treeop0, treeop1,
+				   op0, op1, target, 0);
       rtx_insn *sgn_insns = get_insns ();
       end_sequence ();
       unsigned uns_cost = seq_cost (uns_insns, speed_p);
@@ -9016,7 +9019,8 @@ expand_expr_divmod (tree_code code, machine_mode mode, tree treeop0,
	  emit_insn (sgn_insns);
	  return sgn_ret;
	}
-  return expand_divmod (mod_p, code, mode, op0, op1, target, unsignedp);
+  return expand_divmod (mod_p, code, mode, treeop0, treeop1,
+			op0, op1, target, unsignedp);
 }
 
 rtx
diff --git a/gcc/optabs.cc b/gcc/optabs.cc
index 165f8d1fa22432b96967c69a58dbb7b4bf18120d..cff37ccb0dfc3dd79b97d0abfd872f340855dc96 100644
--- a/gcc/optabs.cc
+++ b/gcc/optabs.cc
@@ -1104,8 +1104,9 @@ expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
		return NULL_RTX;
	    }
	}
-      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum,
-				     gen_int_mode (INTVAL (op1), word_mode),
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, NULL, NULL,
+				     sum, gen_int_mode (INTVAL (op1),
							word_mode),
				     NULL_RTX, 1, OPTAB_DIRECT);
       if (remainder == NULL_RTX)
	return NULL_RTX;
@@ -1208,8 +1209,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
 
   if (op11 != const1_rtx)
     {
-      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, quot1, op11,
-				NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx rem2 = expand_divmod (1, TRUNC_MOD_EXPR, mode, NULL, NULL, quot1,
+				op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (rem2 == NULL_RTX)
	return NULL_RTX;
@@ -1223,8 +1224,8 @@ expand_doubleword_divmod (machine_mode mode, rtx op0, rtx op1, rtx *rem,
       if (rem2 == NULL_RTX)
	return NULL_RTX;
 
-      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, quot1, op11,
-				 NULL_RTX, unsignedp, OPTAB_DIRECT);
+      rtx quot2 = expand_divmod (0, TRUNC_DIV_EXPR, mode, NULL, NULL, quot1,
+				 op11, NULL_RTX, unsignedp, OPTAB_DIRECT);
       if (quot2 == NULL_RTX)
	return NULL_RTX;
 
diff --git a/gcc/target.def b/gcc/target.def
index 2a7fa68f83dd15dcdd2c332e8431e6142ec7d305..92ebd2af18fe8abb6ed95b07081cdd70113db9b1 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1902,6 +1902,25 @@ implementation approaches itself.",
	      const vec_perm_indices &sel),
 NULL)
 
+DEFHOOK
+(can_special_div_by_const,
+ "This hook is used to test whether the target has a special method of\n\
+division of vectors of type @var{vectype} using the two operands @code{treeop0}\n\
+and @code{treeop1} and producing a vector of type @var{vectype}.  The division\n\
+will then not be decomposed and is kept as a div.\n\
+\n\
+When the hook is being used to test whether the target supports a special\n\
+divide, @var{in0}, @var{in1}, and @var{output} are all null.  When the hook\n\
+is being used to emit a division, @var{in0} and @var{in1} are the source\n\
+vectors of type @var{vectype} and @var{output} is the destination vector of\n\
+type @var{vectype}.\n\
+\n\
+Return true if the operation is possible, emitting instructions for it\n\
+if rtxes are provided and updating @var{output}.",
+ bool, (enum tree_code, tree vectype, tree treeop0, tree treeop1, rtx *output,
+	rtx in0, rtx in1),
+ default_can_special_div_by_const)
+
 /* Return true if the target supports misaligned store/load of a
    specific factor denoted in the third parameter.  The last parameter
    is true if the access is defined in a packed struct.  */
diff --git a/gcc/target.h b/gcc/target.h
index d6fa6931499d15edff3e5af3e429540d001c7058..c836036ac7fa7910d62bd3da56f39c061f68b665 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -51,6 +51,7 @@
 #include "insn-codes.h"
 #include "tm.h"
 #include "hard-reg-set.h"
+#include "tree-core.h"
 
 #if CHECKING_P
 
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index ecce55ebe797cedc940620e8d89816973a045d49..42451a3e22e86fee9da2f56e2640d63f936b336d 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -207,6 +207,8 @@ extern void default_addr_space_diagnose_usage (addr_space_t, location_t);
 extern rtx default_addr_space_convert (rtx, tree, tree);
 extern unsigned int default_case_values_threshold (void);
 extern bool default_have_conditional_execution (void);
+extern bool default_can_special_div_by_const (enum tree_code, tree, tree, tree,
+					      rtx *, rtx, rtx);
 extern bool default_libc_has_function (enum function_class, tree);
 extern bool default_libc_has_fast_function (int fcode);
 
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index b15ae19bcb60c59ae8112e67b5f06a241a9bdbf1..8206533382611a7640efba241279936ced41ee95 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1807,6 +1807,14 @@ default_have_conditional_execution (void)
   return HAVE_conditional_execution;
 }
 
+/* Default that no division by constant operations are special.  */
+bool
+default_can_special_div_by_const (enum tree_code, tree, tree, tree, rtx *, rtx,
+				  rtx)
+{
+  return false;
+}
+
 /* By default we assume that c99 functions are present at the
    runtime, but sincos is not.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..472cd710534bc8aa9b1b4916f3d7b4d5b64a19b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-1.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..e904a71885b2e8487593a2cd3db75b3e4112e2cc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-2.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint16_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
new file mode 100644
index 0000000000000000000000000000000000000000..a1418ebbf5ea8731ed4e3e720157701d9d1cf852
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-3.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
new file mode 100644
index 0000000000000000000000000000000000000000..29a16739aa4b706616367bfd1832f28ebd07993e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask.h
@@ -0,0 +1,43 @@
+#include <stdio.h>
+
+#ifndef N
+#define N 65
+#endif
+
+#ifndef TYPE
+#define TYPE uint32_t
+#endif
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
new file mode 100644
index 0000000000000000000000000000000000000000..2a535791ba7258302e0c2cf44ab211cd246d82d5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/div-by-bitmask.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+#pragma GCC target "+nosve"
+
+/*
+** draw_bitmap1:
+** ...
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** addhn v[0-9]+.8b, v[0-9]+.8h, v[0-9]+.8h
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uaddw v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8b
+** uzp2 v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** addhn v[0-9]+.4h, v[0-9]+.4s, v[0-9]+.4s
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uaddw v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+** uzp2 v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** addhn v[0-9]+.2s, v[0-9]+.2d, v[0-9]+.2d
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uaddw v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2s
+** uzp2 v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] = (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 350129555a0c71c0896c4f1003163f3b3557c11b..ebee5e24b186915ebcb3a817c9a12046b6ec94f3 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1237,6 +1237,14 @@ expand_vector_operation (gimple_stmt_iterator *gsi, tree type, tree compute_type
   tree rhs2 = gimple_assign_rhs2 (assign);
   tree ret;
 
+  /* Check if the target was going to handle it through the special
+     division callback hook.  */
+  if (targetm.vectorize.can_special_div_by_const (code, type, rhs1,
+						  rhs2, NULL,
+						  NULL_RTX, NULL_RTX))
+    return NULL_TREE;
+
+
   if (!optimize
       || !VECTOR_INTEGER_TYPE_P (type)
       || TREE_CODE (rhs2) != VECTOR_CST
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 09574bb1a2696b3438a4ce9f09f74b42e784aca0..607acdf95eb30335d8bc0e85af0b1bfea10fe443 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3596,6 +3596,12 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if (targetm.vectorize.can_special_div_by_const (rhs_code, vectype,
+						       oprnd0, oprnd1, NULL,
+						       NULL_RTX, NULL_RTX))
+    {
+      return NULL;
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index c9dab217f059f17e91e9a7582523e627d7a45b66..6d05c48a7339de094d7288bd68e0e1c1e93faafe 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -6260,6 +6260,11 @@ vectorizable_operation (vec_info *vinfo,
	}
       target_support_p = (optab_handler (optab, vec_mode)
			  != CODE_FOR_nothing);
+      if (!target_support_p)
+	target_support_p
+	  = targetm.vectorize.can_special_div_by_const (code, vectype,
+							op0, op1, NULL,
+							NULL_RTX, NULL_RTX);
     }
 
   bool using_emulated_vectors_p = vect_emulated_vector_p (vectype);
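
As an aside for reviewers, the arithmetic the new expander relies on can be checked in isolation.  The snippet below is a minimal standalone sketch (not part of the patch): it models the addhn/uaddw/ushr sequence in scalar C for the uint8_t case and verifies it against a plain division by 0xff over all byte products.  The helper and variable names are mine, chosen purely for illustration.

/* Standalone sketch: model the three-step sequence the expander emits for
   unsigned division of a widened byte product by 0xff and compare it with
   the plain division.  */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  for (unsigned a = 0; a <= 0xff; a++)
    for (unsigned b = 0; b <= 0xff; b++)
      {
	/* The widened product, as the vectorizer sees it in a 16-bit lane.  */
	uint16_t x = (uint16_t) (a * b);
	/* addhn: add 0x0101 (i.e. +1 per byte) in 16 bits, keep the high byte.  */
	uint8_t hi = (uint8_t) ((uint16_t) (x + 0x0101) >> 8);
	/* uaddw: widen that byte again and add it to the original value.  */
	uint16_t sum = (uint16_t) (x + hi);
	/* ushr #8 (folded to uzp2 by combine): the top byte is the quotient.  */
	uint8_t q = (uint8_t) (sum >> 8);
	if (q != x / 0xff)
	  {
	    printf ("mismatch for %u * %u\n", a, b);
	    return 1;
	  }
      }
  puts ("(x + ((x + 257) >> 8)) >> 8 == x / 255 for all products of two bytes");
  return 0;
}

The same reasoning carries over to the 16- and 32-bit element cases with 0xffff and 0xffffffff, which is what the new gcc.dg/vect tests exercise at runtime.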