
[PR113816] AArch64: Use SIMD+GPR for logical vector reductions

Message ID A4713418-FDC8-4D1C-893C-F36F89C5EA08@nvidia.com
State New

Commit Message

Jennifer Schmitz Oct. 10, 2024, 8:27 a.m. UTC
This patch implements the optabs reduc_and_scal_<mode>,
reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
vector reduction operations.
Previously, either only vector registers or only general purpose registers
(GPRs) were used. Now, vector registers are used for the reduction from 128 to
64 bits; 64-bit GPRs are used for the reduction from 64 to 32 bits; and 32-bit
GPRs are used for the remaining reduction steps.

For example, the test case (V8HI)
int16_t foo (int16_t *a)
{
  int16_t b = -1;
  for (int i = 0; i < 8; ++i)
    b &= a[i];
  return b;
}

was previously compiled to (-O2):
foo:
	ldr     q0, [x0]
	movi    v30.4s, 0
	ext     v29.16b, v0.16b, v30.16b, #8
	and     v29.16b, v29.16b, v0.16b
	ext     v31.16b, v29.16b, v30.16b, #4
	and     v31.16b, v31.16b, v29.16b
	ext     v30.16b, v31.16b, v30.16b, #2
	and     v30.16b, v30.16b, v31.16b
	umov    w0, v30.h[0]
	ret

With the patch, it is compiled to:
foo:
	ldr     q31, [x0]
	ext     v30.16b, v31.16b, v31.16b, #8
	and     v31.8b, v30.8b, v31.8b
	fmov    x0, d31
	and     x0, x0, x0, lsr 32
	and     w0, w0, w0, lsr 16
	ret

For modes V4SI and V2DI, the pattern was not implemented, because the
current codegen (using only base instructions) is already efficient.

Note that the PR initially suggested using SVE reduction ops. However,
they have higher latency than the proposed sequence, which is why using
Neon and base instructions is preferable.

Test cases were added for 8/16-bit integers for all implemented modes and all
three operations to check the produced assembly.

We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
because for aarch64 vector types either the logical reduction optabs are
implemented or the codegen for reduction operations is already good as it is.
This was motivated by the failure of a scan-tree-dump directive in the test
cases gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	PR target/113816
	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
	Implement for logical bitwise operations for VDQV_E.

gcc/testsuite/
	PR target/113816
	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
	* gcc.target/aarch64/simd/logical_reduc.c: New test.
	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
---
 gcc/config/aarch64/aarch64-simd.md            |  55 +++++
 .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
 .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
 gcc/testsuite/lib/target-supports.exp         |   4 +-
 4 files changed, 267 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c

Comments

Tamar Christina Oct. 10, 2024, 6:31 p.m. UTC | #1
Hi Jennifer,

> -----Original Message-----
> From: Jennifer Schmitz <jschmitz@nvidia.com>
> Sent: Thursday, October 10, 2024 9:27 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Kyrylo Tkachov <ktkachov@nvidia.com>; Tamar
> Christina <Tamar.Christina@arm.com>
> Subject: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> reductions
> 
> This patch implements the optabs reduc_and_scal_<mode>,
> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
> vector reduction operations.
> Previously, either only vector registers or only general purpose registers (GPR)
> were used. Now, vector registers are used for the reduction from 128 to 64 bits;
> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are used
> for the rest of the reduction steps.
> 
> For example, the test case (V8HI)
> int16_t foo (int16_t *a)
> {
>   int16_t b = -1;
>   for (int i = 0; i < 8; ++i)
>     b &= a[i];
>   return b;
> }
> 
> was previously compiled to (-O2):
> foo:
> 	ldr     q0, [x0]
> 	movi    v30.4s, 0
> 	ext     v29.16b, v0.16b, v30.16b, #8
> 	and     v29.16b, v29.16b, v0.16b
> 	ext     v31.16b, v29.16b, v30.16b, #4
> 	and     v31.16b, v31.16b, v29.16b
> 	ext     v30.16b, v31.16b, v30.16b, #2
> 	and     v30.16b, v30.16b, v31.16b
> 	umov    w0, v30.h[0]
> 	ret
> 
> With patch, it is compiled to:
> foo:
> 	ldr     q31, [x0]
> 	ext     v30.16b, v31.16b, v31.16b, #8
> 	and     v31.8b, v30.8b, v31.8b
> 	fmov    x0, d31
> 	and     x0, x0, x0, lsr 32
> 	and     w0, w0, w0, lsr 16
> 	ret
> 
> For modes V4SI and V2DI, the pattern was not implemented, because the
> current codegen (using only base instructions) is already efficient.
> 
> Note that the PR initially suggested to use SVE reduction ops. However,
> they have higher latency than the proposed sequence, which is why using
> neon and base instructions is preferable.
> 
> Test cases were added for 8/16-bit integers for all implemented modes and all
> three operations to check the produced assembly.
> 
> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> because for aarch64 vector types, either the logical reduction optabs are
> implemented or the codegen for reduction operations is good as it is.
> This was motivated by failure of a scan-tree-dump directive in the test cases
> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> 
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
> 
> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> 
> gcc/
> 	PR target/113816
> 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> 	Implement for logical bitwise operations for VDQV_E.
> 
> gcc/testsuite/
> 	PR target/113816
> 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>  gcc/testsuite/lib/target-supports.exp         |   4 +-
>  4 files changed, 267 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-
> simd.md
> index 23c03a96371..00286b8b020 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3608,6 +3608,61 @@
>    }
>  )
> 
> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> +;; on the two halves of the input.
> +;; If the input has 128 bits, the first operation is performed in vector
> +;; registers.  From 64 bits down, the reduction steps are performed in general
> +;; purpose registers.
> +;; For example, for V8HI and operation AND, the intended sequence is:
> +;; EXT      v1.16b, v0.16b, v0.16b, #8
> +;; AND      v0.8b, v1.8b, v0.8b
> +;; FMOV     x0, d0
> +;; AND      x0, x0, x0, 32
> +;; AND      w0, w0, w0, 16
> +;;
> +;; For V8QI and operation AND, the sequence is:
> +;; AND      x0, x0, x0, lsr 32
> +;; AND      w0, w0, w0, lsr, 16
> +;; AND      w0, w0, w0, lsr, 8
> +
> +(define_expand "reduc_<optab>_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> +  "TARGET_SIMD"
> +  {
> +    rtx dst = operands[1];
> +    rtx tdi = gen_reg_rtx (DImode);
> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> +    rtx op1_lo;
> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> +      {
> +	rtx t0 = gen_reg_rtx (<MODE>mode);
> +	rtx t1 = gen_reg_rtx (DImode);
> +	rtx t2 = gen_reg_rtx (DImode);
> +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> +	emit_move_insn (tdi, t1);
> +      }
> +    else
> +      {
> +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> +	emit_move_insn (tdi, op1_lo);
> +      }
> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> +    DONE;
> +  }
> +)

Thanks! These look great!
If I could make a suggestion, since they are logical operators you can still perform
the operations at the wider mode and truncate after.  This allows you to cut down
on the number of temporaries.  So how about something like this instead:

(define_expand "reduc_<optab>_scal_<mode>"
 [(match_operand:<VEL> 0 "register_operand")
  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
  "TARGET_SIMD"
  {
    rtx dst = operands[1];
    rtx tdi = gen_reg_rtx (DImode);
    if (<bitsize> == 128)
      {
	rtx t0 = gen_reg_rtx (<MODE>mode);
	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst,
					  GEN_INT (<nunits> / 2)));
	emit_insn (gen_<optab><mode>3 (dst, dst, t0));
      }
    emit_move_insn (tdi, lowpart_subreg (DImode, dst, <MODE>mode));
    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
    DONE;
  }
)

I also prefer the iterators here as the code compiled from the pattern will
elide the dead cases at compile time.  Using this you'll have to update the
tests in logical_reduc.c.

Thanks,
Tamar 

>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
>   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
>         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> new file mode 100644
> index 00000000000..9508288b218
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> @@ -0,0 +1,208 @@
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** fv16qi_and:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	and	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 16; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_and:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_or:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	orr	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_or:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_xor:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	eor	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_xor:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_and:
> +**	ldr	x0, \[x0\]
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	and	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_and:
> +**	ldr	x0, \[x0\]
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 4; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_or:
> +**	ldr	x0, \[x0\]
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	orr	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_or:
> +**	ldr	x0, \[x0\]
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_xor:
> +**	ldr	x0, \[x0\]
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	eor	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_xor:
> +**	ldr	x0, \[x0\]
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> index 918822a7d00..70c4ca18094 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
>    return 0;
>  }
> 
> -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index 8f2afe866c7..44f737f15d0 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
>  		   || [istarget amdgcn-*-*]
>  		   || [check_effective_target_riscv_v]
>  		   || [check_effective_target_loongarch_sx]
> -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> +		   || [istarget i?86-*-*]
> +		   || [istarget x86_64-*-*]
> +		   || [istarget aarch64*-*-*]}]
>  }
> 
>  # Return 1 if the target supports the fold_extract_last optab.
> --
> 2.44.0
Richard Sandiford Oct. 10, 2024, 7:07 p.m. UTC | #2
Jennifer Schmitz <jschmitz@nvidia.com> writes:
> This patch implements the optabs reduc_and_scal_<mode>,
> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
> vector reduction operations.
> Previously, either only vector registers or only general purpose registers (GPR)
> were used. Now, vector registers are used for the reduction from 128 to 64 bits;
> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are used
> for the rest of the reduction steps.
>
> For example, the test case (V8HI)
> int16_t foo (int16_t *a)
> {
>   int16_t b = -1;
>   for (int i = 0; i < 8; ++i)
>     b &= a[i];
>   return b;
> }
>
> was previously compiled to (-O2):
> foo:
> 	ldr     q0, [x0]
> 	movi    v30.4s, 0
> 	ext     v29.16b, v0.16b, v30.16b, #8
> 	and     v29.16b, v29.16b, v0.16b
> 	ext     v31.16b, v29.16b, v30.16b, #4
> 	and     v31.16b, v31.16b, v29.16b
> 	ext     v30.16b, v31.16b, v30.16b, #2
> 	and     v30.16b, v30.16b, v31.16b
> 	umov    w0, v30.h[0]
> 	ret
>
> With patch, it is compiled to:
> foo:
> 	ldr     q31, [x0]
> 	ext     v30.16b, v31.16b, v31.16b, #8
> 	and     v31.8b, v30.8b, v31.8b
> 	fmov    x0, d31
> 	and     x0, x0, x0, lsr 32
> 	and     w0, w0, w0, lsr 16
> 	ret
>
> For modes V4SI and V2DI, the pattern was not implemented, because the
> current codegen (using only base instructions) is already efficient.
>
> Note that the PR initially suggested to use SVE reduction ops. However,
> they have higher latency than the proposed sequence, which is why using
> neon and base instructions is preferable.
>
> Test cases were added for 8/16-bit integers for all implemented modes and all
> three operations to check the produced assembly.
>
> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> because for aarch64 vector types, either the logical reduction optabs are
> implemented or the codegen for reduction operations is good as it is.
> This was motivated by failure of a scan-tree-dump directive in the test cases
> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?
>
> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>
> gcc/
> 	PR target/113816
> 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> 	Implement for logical bitwise operations for VDQV_E.
>
> gcc/testsuite/
> 	PR target/113816
> 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>  gcc/testsuite/lib/target-supports.exp         |   4 +-
>  4 files changed, 267 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 23c03a96371..00286b8b020 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3608,6 +3608,61 @@
>    }
>  )
>  
> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> +;; on the two halves of the input.
> +;; If the input has 128 bits, the first operation is performed in vector
> +;; registers.  From 64 bits down, the reduction steps are performed in general
> +;; purpose registers.
> +;; For example, for V8HI and operation AND, the intended sequence is:
> +;; EXT      v1.16b, v0.16b, v0.16b, #8
> +;; AND      v0.8b, v1.8b, v0.8b
> +;; FMOV     x0, d0
> +;; AND      x0, x0, x0, 32
> +;; AND      w0, w0, w0, 16
> +;;
> +;; For V8QI and operation AND, the sequence is:
> +;; AND      x0, x0, x0, lsr 32
> +;; AND      w0, w0, w0, lsr, 16
> +;; AND      w0, w0, w0, lsr, 8
> +
> +(define_expand "reduc_<optab>_scal_<mode>"
> + [(match_operand:<VEL> 0 "register_operand")
> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> +  "TARGET_SIMD"
> +  {
> +    rtx dst = operands[1];
> +    rtx tdi = gen_reg_rtx (DImode);
> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> +    rtx op1_lo;
> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> +      {
> +	rtx t0 = gen_reg_rtx (<MODE>mode);
> +	rtx t1 = gen_reg_rtx (DImode);
> +	rtx t2 = gen_reg_rtx (DImode);
> +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> +	emit_move_insn (tdi, t1);
> +      }
> +    else
> +      {
> +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> +	emit_move_insn (tdi, op1_lo);
> +      }
> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> +    DONE;
> +  }
> +)

Sorry to be awkward, but I've got mixed feelings about this.
The sequence produced looks good.  But we're not defining this
optab because the target has a special instruction for implementing
the operation.  We're just implementing it to tweak the loop emitted
by vect_create_epilog_for_reduction.  Maybe we should change that loop
instead, perhaps with a target hook?

Also, this seems related to the fact that:

typedef unsigned char v8qi __attribute__((vector_size(8)));
unsigned char foo(v8qi x) {
  x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
  return x[0];
}

generates:

        ext     v31.8b, v0.8b, v0.8b, #1
        and     v31.8b, v31.8b, v0.8b
        umov    x0, v31.d[0]
        ret

instead of something like:

        umov    w0, v31.h[0]
        and     w0, w0, lsr #8
        ret

So an alternative might be to do some post-vectorisation simplification,
perhaps in isel?

cc:ing Richi for his thoughts.

Thanks,
Richard

> +
>  (define_insn "aarch64_reduc_<optab>_internal<mode>"
>   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
>         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> new file mode 100644
> index 00000000000..9508288b218
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> @@ -0,0 +1,208 @@
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** fv16qi_and:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	and	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 16; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_and:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_or:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	orr	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_or:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv16qi_xor:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	eor	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv16qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 16; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8hi_xor:
> +**	ldr	q([0-9]+), \[x0\]
> +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> +**	fmov	x0, d\1
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv8hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_and:
> +**	ldr	x0, \[x0\]
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	and	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_and (int8_t *a)
> +{
> +  int8_t b = -1;
> +  for (int i = 0; i < 8; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_and:
> +**	ldr	x0, \[x0\]
> +**	and	x0, x0, x0, lsr 32
> +**	and	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_and (int16_t *a)
> +{
> +  int16_t b = -1;
> +  for (int i = 0; i < 4; ++i)
> +    b &= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_or:
> +**	ldr	x0, \[x0\]
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	orr	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_or (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_or:
> +**	ldr	x0, \[x0\]
> +**	orr	x0, x0, x0, lsr 32
> +**	orr	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_or (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b |= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv8qi_xor:
> +**	ldr	x0, \[x0\]
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	eor	w0, w0, w0, lsr 8
> +**	ret
> +*/
> +int8_t fv8qi_xor (int8_t *a)
> +{
> +  int8_t b = 0;
> +  for (int i = 0; i < 8; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> +
> +/*
> +** fv4hi_xor:
> +**	ldr	x0, \[x0\]
> +**	eor	x0, x0, x0, lsr 32
> +**	eor	w0, w0, w0, lsr 16
> +**	ret
> +*/
> +int16_t fv4hi_xor (int16_t *a)
> +{
> +  int16_t b = 0;
> +  for (int i = 0; i < 4; ++i)
> +    b ^= a[i];
> +  return b;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> index 918822a7d00..70c4ca18094 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } } */
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> index 8f2afe866c7..44f737f15d0 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
>  		   || [istarget amdgcn-*-*]
>  		   || [check_effective_target_riscv_v]
>  		   || [check_effective_target_loongarch_sx]
> -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> +		   || [istarget i?86-*-*]
> +		   || [istarget x86_64-*-*]
> +		   || [istarget aarch64*-*-*]}]
>  }
>  
>  # Return 1 if the target supports the fold_extract_last optab.
Tamar Christina Oct. 10, 2024, 7:18 p.m. UTC | #3
> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, October 10, 2024 8:08 PM
> To: Jennifer Schmitz <jschmitz@nvidia.com>
> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Kyrylo Tkachov <ktkachov@nvidia.com>; Tamar Christina
> <Tamar.Christina@arm.com>; rguenther@suse.de
> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> reductions
> 
> Jennifer Schmitz <jschmitz@nvidia.com> writes:
> > This patch implements the optabs reduc_and_scal_<mode>,
> > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
> > vector reduction operations.
> > Previously, either only vector registers or only general purpose registers (GPR)
> > were used. Now, vector registers are used for the reduction from 128 to 64 bits;
> > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
> used
> > for the rest of the reduction steps.
> >
> > For example, the test case (V8HI)
> > int16_t foo (int16_t *a)
> > {
> >   int16_t b = -1;
> >   for (int i = 0; i < 8; ++i)
> >     b &= a[i];
> >   return b;
> > }
> >
> > was previously compiled to (-O2):
> > foo:
> > 	ldr     q0, [x0]
> > 	movi    v30.4s, 0
> > 	ext     v29.16b, v0.16b, v30.16b, #8
> > 	and     v29.16b, v29.16b, v0.16b
> > 	ext     v31.16b, v29.16b, v30.16b, #4
> > 	and     v31.16b, v31.16b, v29.16b
> > 	ext     v30.16b, v31.16b, v30.16b, #2
> > 	and     v30.16b, v30.16b, v31.16b
> > 	umov    w0, v30.h[0]
> > 	ret
> >
> > With patch, it is compiled to:
> > foo:
> > 	ldr     q31, [x0]
> > 	ext     v30.16b, v31.16b, v31.16b, #8
> > 	and     v31.8b, v30.8b, v31.8b
> > 	fmov    x0, d31
> > 	and     x0, x0, x0, lsr 32
> > 	and     w0, w0, w0, lsr 16
> > 	ret
> >
> > For modes V4SI and V2DI, the pattern was not implemented, because the
> > current codegen (using only base instructions) is already efficient.
> >
> > Note that the PR initially suggested to use SVE reduction ops. However,
> > they have higher latency than the proposed sequence, which is why using
> > neon and base instructions is preferable.
> >
> > Test cases were added for 8/16-bit integers for all implemented modes and all
> > three operations to check the produced assembly.
> >
> > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> > because for aarch64 vector types, either the logical reduction optabs are
> > implemented or the codegen for reduction operations is good as it is.
> > This was motivated by failure of a scan-tree-dump directive in the test cases
> > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> >
> > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> > OK for mainline?
> >
> > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >
> > gcc/
> > 	PR target/113816
> > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> > 	Implement for logical bitwise operations for VDQV_E.
> >
> > gcc/testsuite/
> > 	PR target/113816
> > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > ---
> >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
> >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
> >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
> >  gcc/testsuite/lib/target-supports.exp         |   4 +-
> >  4 files changed, 267 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > index 23c03a96371..00286b8b020 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3608,6 +3608,61 @@
> >    }
> >  )
> >
> > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> > +;; on the two halves of the input.
> > +;; If the input has 128 bits, the first operation is performed in vector
> > +;; registers.  From 64 bits down, the reduction steps are performed in general
> > +;; purpose registers.
> > +;; For example, for V8HI and operation AND, the intended sequence is:
> > +;; EXT      v1.16b, v0.16b, v0.16b, #8
> > +;; AND      v0.8b, v1.8b, v0.8b
> > +;; FMOV     x0, d0
> > +;; AND      x0, x0, x0, 32
> > +;; AND      w0, w0, w0, 16
> > +;;
> > +;; For V8QI and operation AND, the sequence is:
> > +;; AND      x0, x0, x0, lsr 32
> > +;; AND      w0, w0, w0, lsr, 16
> > +;; AND      w0, w0, w0, lsr, 8
> > +
> > +(define_expand "reduc_<optab>_scal_<mode>"
> > + [(match_operand:<VEL> 0 "register_operand")
> > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> > +  "TARGET_SIMD"
> > +  {
> > +    rtx dst = operands[1];
> > +    rtx tdi = gen_reg_rtx (DImode);
> > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> > +    rtx op1_lo;
> > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> > +      {
> > +	rtx t0 = gen_reg_rtx (<MODE>mode);
> > +	rtx t1 = gen_reg_rtx (DImode);
> > +	rtx t2 = gen_reg_rtx (DImode);
> > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> > +	emit_move_insn (tdi, t1);
> > +      }
> > +    else
> > +      {
> > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> > +	emit_move_insn (tdi, op1_lo);
> > +      }
> > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> > +    DONE;
> > +  }
> > +)
> 
> Sorry to be awkward, but I've got mixed feelings about this.
> The sequence produced looks good.  But we're not defining this
> optab because the target has a special instruction for implementing
> the operation.  We're just implementing it to tweak the loop emitted
> by vect_create_epilog_for_reduction.  Maybe we should change that loop
> instead, perhaps with a target hook?
> 
> Also, this seems related to the fact that:
> 
> typedef unsigned char v8qi __attribute__((vector_size(8)));
> unsigned char foo(v8qi x) {
>   x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
>   return x[0];
> }
> 
> generates:
> 
>         ext     v31.8b, v0.8b, v0.8b, #1
>         and     v31.8b, v31.8b, v0.8b
>         umov    x0, v31.d[0]
>         ret
> 
> instead of something like:
> 
>         umov    w0, v31.h[0]
>         and     w0, w0, lsr #8
>         ret
> 
> So an alternative might be to do some post-vectorisation simplification,
> perhaps in isel?

I suppose you could change vect_create_epilog_for_reduction to rotate
the vector and do vf/2 reduction at a time using VEC_PERM_EXPR instead
of what the vectorizer does at the moment.

But aren't we quite close to an AArch64 register file specific thing here?

I'm not quite sure that generically this sequence makes sense, and a target
hook for this would be the same as an optab isn't it?

Cheers,
Tamar
> 
> cc:ing Richi for his thoughts.
> 
> Thanks,
> Richard
> 
> > +
> >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> >   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
> >         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > new file mode 100644
> > index 00000000000..9508288b218
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > @@ -0,0 +1,208 @@
> > +/* { dg-options "-O2 -ftree-vectorize" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +
> > +#include <stdint.h>
> > +
> > +/*
> > +** fv16qi_and:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	and	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_and (int8_t *a)
> > +{
> > +  int8_t b = -1;
> > +  for (int i = 0; i < 16; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_and:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_and (int16_t *a)
> > +{
> > +  int16_t b = -1;
> > +  for (int i = 0; i < 8; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv16qi_or:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	orr	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_or (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 16; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_or:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_or (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv16qi_xor:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	eor	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_xor (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 16; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_xor:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_xor (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_and:
> > +**	ldr	x0, \[x0\]
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	and	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_and (int8_t *a)
> > +{
> > +  int8_t b = -1;
> > +  for (int i = 0; i < 8; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_and:
> > +**	ldr	x0, \[x0\]
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_and (int16_t *a)
> > +{
> > +  int16_t b = -1;
> > +  for (int i = 0; i < 4; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_or:
> > +**	ldr	x0, \[x0\]
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	orr	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_or (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_or:
> > +**	ldr	x0, \[x0\]
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_or (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 4; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_xor:
> > +**	ldr	x0, \[x0\]
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	eor	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_xor (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_xor:
> > +**	ldr	x0, \[x0\]
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_xor (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 4; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > index 918822a7d00..70c4ca18094 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
> >    return 0;
> >  }
> >
> > -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> > +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } }
> */
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> > index 8f2afe866c7..44f737f15d0 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
> >  		   || [istarget amdgcn-*-*]
> >  		   || [check_effective_target_riscv_v]
> >  		   || [check_effective_target_loongarch_sx]
> > -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> > +		   || [istarget i?86-*-*]
> > +		   || [istarget x86_64-*-*]
> > +		   || [istarget aarch64*-*-*]}]
> >  }
> >
> >  # Return 1 if the target supports the fold_extract_last optab.
Richard Sandiford Oct. 10, 2024, 8:38 p.m. UTC | #4
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Thursday, October 10, 2024 8:08 PM
>> To: Jennifer Schmitz <jschmitz@nvidia.com>
>> Cc: gcc-patches@gcc.gnu.org; Richard Earnshaw <Richard.Earnshaw@arm.com>;
>> Kyrylo Tkachov <ktkachov@nvidia.com>; Tamar Christina
>> <Tamar.Christina@arm.com>; rguenther@suse.de
>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>> reductions
>> 
>> Jennifer Schmitz <jschmitz@nvidia.com> writes:
>> > This patch implements the optabs reduc_and_scal_<mode>,
>> > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
>> > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
>> > vector reduction operations.
>> > Previously, either only vector registers or only general purpose registers (GPR)
>> > were used. Now, vector registers are used for the reduction from 128 to 64 bits;
>> > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
>> used
>> > for the rest of the reduction steps.
>> >
>> > For example, the test case (V8HI)
>> > int16_t foo (int16_t *a)
>> > {
>> >   int16_t b = -1;
>> >   for (int i = 0; i < 8; ++i)
>> >     b &= a[i];
>> >   return b;
>> > }
>> >
>> > was previously compiled to (-O2):
>> > foo:
>> > 	ldr     q0, [x0]
>> > 	movi    v30.4s, 0
>> > 	ext     v29.16b, v0.16b, v30.16b, #8
>> > 	and     v29.16b, v29.16b, v0.16b
>> > 	ext     v31.16b, v29.16b, v30.16b, #4
>> > 	and     v31.16b, v31.16b, v29.16b
>> > 	ext     v30.16b, v31.16b, v30.16b, #2
>> > 	and     v30.16b, v30.16b, v31.16b
>> > 	umov    w0, v30.h[0]
>> > 	ret
>> >
>> > With patch, it is compiled to:
>> > foo:
>> > 	ldr     q31, [x0]
>> > 	ext     v30.16b, v31.16b, v31.16b, #8
>> > 	and     v31.8b, v30.8b, v31.8b
>> > 	fmov    x0, d31
>> > 	and     x0, x0, x0, lsr 32
>> > 	and     w0, w0, w0, lsr 16
>> > 	ret
>> >
>> > For modes V4SI and V2DI, the pattern was not implemented, because the
>> > current codegen (using only base instructions) is already efficient.
>> >
>> > Note that the PR initially suggested to use SVE reduction ops. However,
>> > they have higher latency than the proposed sequence, which is why using
>> > neon and base instructions is preferable.
>> >
>> > Test cases were added for 8/16-bit integers for all implemented modes and all
>> > three operations to check the produced assembly.
>> >
>> > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
>> > because for aarch64 vector types, either the logical reduction optabs are
>> > implemented or the codegen for reduction operations is good as it is.
>> > This was motivated by failure of a scan-tree-dump directive in the test cases
>> > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>> >
>> > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
>> > OK for mainline?
>> >
>> > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> >
>> > gcc/
>> > 	PR target/113816
>> > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>> > 	Implement for logical bitwise operations for VDQV_E.
>> >
>> > gcc/testsuite/
>> > 	PR target/113816
>> > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>> > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
>> > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
>> > ---
>> >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>> >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>> >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>> >  gcc/testsuite/lib/target-supports.exp         |   4 +-
>> >  4 files changed, 267 insertions(+), 2 deletions(-)
>> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>> >
>> > diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> > index 23c03a96371..00286b8b020 100644
>> > --- a/gcc/config/aarch64/aarch64-simd.md
>> > +++ b/gcc/config/aarch64/aarch64-simd.md
>> > @@ -3608,6 +3608,61 @@
>> >    }
>> >  )
>> >
>> > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
>> > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
>> > +;; on the two halves of the input.
>> > +;; If the input has 128 bits, the first operation is performed in vector
>> > +;; registers.  From 64 bits down, the reduction steps are performed in general
>> > +;; purpose registers.
>> > +;; For example, for V8HI and operation AND, the intended sequence is:
>> > +;; EXT      v1.16b, v0.16b, v0.16b, #8
>> > +;; AND      v0.8b, v1.8b, v0.8b
>> > +;; FMOV     x0, d0
>> > +;; AND      x0, x0, x0, 32
>> > +;; AND      w0, w0, w0, 16
>> > +;;
>> > +;; For V8QI and operation AND, the sequence is:
>> > +;; AND      x0, x0, x0, lsr 32
>> > +;; AND      w0, w0, w0, lsr, 16
>> > +;; AND      w0, w0, w0, lsr, 8
>> > +
>> > +(define_expand "reduc_<optab>_scal_<mode>"
>> > + [(match_operand:<VEL> 0 "register_operand")
>> > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
>> > +  "TARGET_SIMD"
>> > +  {
>> > +    rtx dst = operands[1];
>> > +    rtx tdi = gen_reg_rtx (DImode);
>> > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
>> > +    rtx op1_lo;
>> > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
>> > +      {
>> > +	rtx t0 = gen_reg_rtx (<MODE>mode);
>> > +	rtx t1 = gen_reg_rtx (DImode);
>> > +	rtx t2 = gen_reg_rtx (DImode);
>> > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
>> > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
>> > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
>> > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
>> > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
>> > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
>> > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
>> > +	emit_move_insn (tdi, t1);
>> > +      }
>> > +    else
>> > +      {
>> > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
>> > +	emit_move_insn (tdi, op1_lo);
>> > +      }
>> > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
>> > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
>> > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
>> > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
>> > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
>> > +    DONE;
>> > +  }
>> > +)
>> 
>> Sorry to be awkward, but I've got mixed feelings about this.
>> The sequence produced looks good.  But we're not defining this
>> optab because the target has a special instruction for implementing
>> the operation.  We're just implementing it to tweak the loop emitted
>> by vect_create_epilog_for_reduction.  Maybe we should change that loop
>> instead, perhaps with a target hook?
>> 
>> Also, this seems related to the fact that:
>> 
>> typedef unsigned char v8qi __attribute__((vector_size(8)));
>> unsigned char foo(v8qi x) {
>>   x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
>>   return x[0];
>> }
>> 
>> generates:
>> 
>>         ext     v31.8b, v0.8b, v0.8b, #1
>>         and     v31.8b, v31.8b, v0.8b
>>         umov    x0, v31.d[0]
>>         ret
>> 
>> instead of something like:
>> 
>>         umov    w0, v31.h[0]
>>         and     w0, w0, lsr #8
>>         ret
>> 
>> So an alternative might be to do some post-vectorisation simplification,
>> perhaps in isel?
>
> I suppose you could change vect_create_epilog_for_reduction to rotate
> the vector and do vf/2 reduction at a time using VEC_PERM_EXPR instead
> of what the vectorizer does at the moment.
>
> But aren't we quite close to an AArch64 register file specific thing here?

Maybe.  But I think it's unlikely that AArch64 is the only target that
prefers to do logical ops on scalar integers rather than vectors.
E.g. wouldn't this also be better for AArch32?  But I think it would
be better for other architectures I've worked on as well.

> I'm not quite sure that generically this sequence makes sense, and a target
> hook for this would be the same as an optab isn't it?

I agree it would be clunky if the target hook is very specific to this
situation.  I suppose the challenge would be to work out what property
of the target we're trying to model.

But even so, the main advantage of doing this in gimple is that gimple
passes can then optimise the individual operations with surrounding code.
RTL passes aren't going to be as good at that (though they're still
better than nothing).

Thanks,
Richard
Richard Biener Oct. 11, 2024, 6:51 a.m. UTC | #5
On Thu, 10 Oct 2024, Richard Sandiford wrote:

> Jennifer Schmitz <jschmitz@nvidia.com> writes:
> > This patch implements the optabs reduc_and_scal_<mode>,
> > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise logical
> > vector reduction operations.
> > Previously, either only vector registers or only general purpose registers (GPR)
> > were used. Now, vector registers are used for the reduction from 128 to 64 bits;
> > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are used
> > for the rest of the reduction steps.
> >
> > For example, the test case (V8HI)
> > int16_t foo (int16_t *a)
> > {
> >   int16_t b = -1;
> >   for (int i = 0; i < 8; ++i)
> >     b &= a[i];
> >   return b;
> > }
> >
> > was previously compiled to (-O2):
> > foo:
> > 	ldr     q0, [x0]
> > 	movi    v30.4s, 0
> > 	ext     v29.16b, v0.16b, v30.16b, #8
> > 	and     v29.16b, v29.16b, v0.16b
> > 	ext     v31.16b, v29.16b, v30.16b, #4
> > 	and     v31.16b, v31.16b, v29.16b
> > 	ext     v30.16b, v31.16b, v30.16b, #2
> > 	and     v30.16b, v30.16b, v31.16b
> > 	umov    w0, v30.h[0]
> > 	ret
> >
> > With patch, it is compiled to:
> > foo:
> > 	ldr     q31, [x0]
> > 	ext     v30.16b, v31.16b, v31.16b, #8
> > 	and     v31.8b, v30.8b, v31.8b
> > 	fmov    x0, d31
> > 	and     x0, x0, x0, lsr 32
> > 	and     w0, w0, w0, lsr 16
> > 	ret
> >
> > For modes V4SI and V2DI, the pattern was not implemented, because the
> > current codegen (using only base instructions) is already efficient.
> >
> > Note that the PR initially suggested to use SVE reduction ops. However,
> > they have higher latency than the proposed sequence, which is why using
> > neon and base instructions is preferable.
> >
> > Test cases were added for 8/16-bit integers for all implemented modes and all
> > three operations to check the produced assembly.
> >
> > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> > because for aarch64 vector types, either the logical reduction optabs are
> > implemented or the codegen for reduction operations is good as it is.
> > This was motivated by failure of a scan-tree-dump directive in the test cases
> > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> >
> > The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> > OK for mainline?
> >
> > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> >
> > gcc/
> > 	PR target/113816
> > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> > 	Implement for logical bitwise operations for VDQV_E.
> >
> > gcc/testsuite/
> > 	PR target/113816
> > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > ---
> >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
> >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
> >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
> >  gcc/testsuite/lib/target-supports.exp         |   4 +-
> >  4 files changed, 267 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index 23c03a96371..00286b8b020 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -3608,6 +3608,61 @@
> >    }
> >  )
> >  
> > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> > +;; on the two halves of the input.
> > +;; If the input has 128 bits, the first operation is performed in vector
> > +;; registers.  From 64 bits down, the reduction steps are performed in general
> > +;; purpose registers.
> > +;; For example, for V8HI and operation AND, the intended sequence is:
> > +;; EXT      v1.16b, v0.16b, v0.16b, #8
> > +;; AND      v0.8b, v1.8b, v0.8b
> > +;; FMOV     x0, d0
> > +;; AND      x0, x0, x0, 32
> > +;; AND      w0, w0, w0, 16
> > +;;
> > +;; For V8QI and operation AND, the sequence is:
> > +;; AND      x0, x0, x0, lsr 32
> > +;; AND      w0, w0, w0, lsr, 16
> > +;; AND      w0, w0, w0, lsr, 8
> > +
> > +(define_expand "reduc_<optab>_scal_<mode>"
> > + [(match_operand:<VEL> 0 "register_operand")
> > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> > +  "TARGET_SIMD"
> > +  {
> > +    rtx dst = operands[1];
> > +    rtx tdi = gen_reg_rtx (DImode);
> > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> > +    rtx op1_lo;
> > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> > +      {
> > +	rtx t0 = gen_reg_rtx (<MODE>mode);
> > +	rtx t1 = gen_reg_rtx (DImode);
> > +	rtx t2 = gen_reg_rtx (DImode);
> > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> > +	emit_move_insn (tdi, t1);
> > +      }
> > +    else
> > +      {
> > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> > +	emit_move_insn (tdi, op1_lo);
> > +      }
> > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> > +    DONE;
> > +  }
> > +)
> 
> Sorry to be awkward, but I've got mixed feelings about this.
> The sequence produced looks good.  But we're not defining this
> optab because the target has a special instruction for implementing
> the operation.  We're just implementing it to tweak the loop emitted
> by vect_create_epilog_for_reduction.  Maybe we should change that loop
> instead, perhaps with a target hook?

So does it currently not work because there are no v8qi, v4qi or v2qi
vector modes?  I'll note there already _is_
targetm.vectorize.split_reduction (mode), but it expects a vector
mode as the result and it isn't passed the operation.

I would expect that we can improve on this existing machinery
instead of inventing a new one.

I'll note that providing reduc_scal optabs has the side-effect of
enabling BB reduction vectorization since that doesn't use any
open-coding (yet - it was indeed supposed to be an indication of
whether such a reduction would be reasonably cheap).
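
For concreteness: with the optab defined, straight-line code such as the
purely illustrative function below (name and shape are mine, not taken
from the patch or its testsuite) also becomes a candidate for BB
reduction vectorization, without any loop involved:

#include <stdint.h>

int16_t
bb_and_reduce (int16_t *a)
{
  /* Open-coded AND reduction over eight elements in one basic block;
     with reduc_and_scal_v8hi available the SLP vectorizer may now turn
     this into a vector load plus the reduction sequence.  */
  return a[0] & a[1] & a[2] & a[3] & a[4] & a[5] & a[6] & a[7];
}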

Richard.

> Also, this seems related to the fact that:
> 
> typedef unsigned char v8qi __attribute__((vector_size(8)));
> unsigned char foo(v8qi x) {
>   x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
>   return x[0];
> }
> 
> generates:
> 
>         ext     v31.8b, v0.8b, v0.8b, #1
>         and     v31.8b, v31.8b, v0.8b
>         umov    x0, v31.d[0]
>         ret
> 
> instead of something like:
> 
>         umov    w0, v31.h[0]
>         and     w0, w0, lsr #8
>         ret
> 
> So an alternative might be to do some post-vectorisation simplification,
> perhaps in isel?
> 
> cc:ing Richi for his thoughts.
> 
> Thanks,
> Richard
> 
> > +
> >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> >   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
> >         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > new file mode 100644
> > index 00000000000..9508288b218
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > @@ -0,0 +1,208 @@
> > +/* { dg-options "-O2 -ftree-vectorize" } */
> > +/* { dg-final { check-function-bodies "**" "" } } */
> > +
> > +#include <stdint.h>
> > +
> > +/*
> > +** fv16qi_and:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	and	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_and (int8_t *a)
> > +{
> > +  int8_t b = -1;
> > +  for (int i = 0; i < 16; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_and:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_and (int16_t *a)
> > +{
> > +  int16_t b = -1;
> > +  for (int i = 0; i < 8; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv16qi_or:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	orr	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_or (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 16; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_or:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_or (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv16qi_xor:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	eor	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv16qi_xor (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 16; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8hi_xor:
> > +**	ldr	q([0-9]+), \[x0\]
> > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > +**	fmov	x0, d\1
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv8hi_xor (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_and:
> > +**	ldr	x0, \[x0\]
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	and	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_and (int8_t *a)
> > +{
> > +  int8_t b = -1;
> > +  for (int i = 0; i < 8; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_and:
> > +**	ldr	x0, \[x0\]
> > +**	and	x0, x0, x0, lsr 32
> > +**	and	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_and (int16_t *a)
> > +{
> > +  int16_t b = -1;
> > +  for (int i = 0; i < 4; ++i)
> > +    b &= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_or:
> > +**	ldr	x0, \[x0\]
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	orr	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_or (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_or:
> > +**	ldr	x0, \[x0\]
> > +**	orr	x0, x0, x0, lsr 32
> > +**	orr	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_or (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 4; ++i)
> > +    b |= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv8qi_xor:
> > +**	ldr	x0, \[x0\]
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	eor	w0, w0, w0, lsr 8
> > +**	ret
> > +*/
> > +int8_t fv8qi_xor (int8_t *a)
> > +{
> > +  int8_t b = 0;
> > +  for (int i = 0; i < 8; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > +
> > +/*
> > +** fv4hi_xor:
> > +**	ldr	x0, \[x0\]
> > +**	eor	x0, x0, x0, lsr 32
> > +**	eor	w0, w0, w0, lsr 16
> > +**	ret
> > +*/
> > +int16_t fv4hi_xor (int16_t *a)
> > +{
> > +  int16_t b = 0;
> > +  for (int i = 0; i < 4; ++i)
> > +    b ^= a[i];
> > +  return b;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > index 918822a7d00..70c4ca18094 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
> >    return 0;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> > +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } } */
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> > index 8f2afe866c7..44f737f15d0 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
> >  		   || [istarget amdgcn-*-*]
> >  		   || [check_effective_target_riscv_v]
> >  		   || [check_effective_target_loongarch_sx]
> > -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> > +		   || [istarget i?86-*-*]
> > +		   || [istarget x86_64-*-*]
> > +		   || [istarget aarch64*-*-*]}]
> >  }
> >  
> >  # Return 1 if the target supports the fold_extract_last optab.
>
Tamar Christina Oct. 11, 2024, 7:06 a.m. UTC | #6
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Friday, October 11, 2024 7:52 AM
> To: Richard Sandiford <Richard.Sandiford@arm.com>
> Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
> Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
> <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> reductions
> 
> On Thu, 10 Oct 2024, Richard Sandiford wrote:
> 
> > Jennifer Schmitz <jschmitz@nvidia.com> writes:
> > > This patch implements the optabs reduc_and_scal_<mode>,
> > > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> > > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
> logical
> > > vector reduction operations.
> > > Previously, either only vector registers or only general purpose registers (GPR)
> > > were used. Now, vector registers are used for the reduction from 128 to 64
> bits;
> > > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
> used
> > > for the rest of the reduction steps.
> > >
> > > For example, the test case (V8HI)
> > > int16_t foo (int16_t *a)
> > > {
> > >   int16_t b = -1;
> > >   for (int i = 0; i < 8; ++i)
> > >     b &= a[i];
> > >   return b;
> > > }
> > >
> > > was previously compiled to (-O2):
> > > foo:
> > > 	ldr     q0, [x0]
> > > 	movi    v30.4s, 0
> > > 	ext     v29.16b, v0.16b, v30.16b, #8
> > > 	and     v29.16b, v29.16b, v0.16b
> > > 	ext     v31.16b, v29.16b, v30.16b, #4
> > > 	and     v31.16b, v31.16b, v29.16b
> > > 	ext     v30.16b, v31.16b, v30.16b, #2
> > > 	and     v30.16b, v30.16b, v31.16b
> > > 	umov    w0, v30.h[0]
> > > 	ret
> > >
> > > With patch, it is compiled to:
> > > foo:
> > > 	ldr     q31, [x0]
> > > 	ext     v30.16b, v31.16b, v31.16b, #8
> > > 	and     v31.8b, v30.8b, v31.8b
> > > 	fmov    x0, d31
> > > 	and     x0, x0, x0, lsr 32
> > > 	and     w0, w0, w0, lsr 16
> > > 	ret
> > >
> > > For modes V4SI and V2DI, the pattern was not implemented, because the
> > > current codegen (using only base instructions) is already efficient.
> > >
> > > Note that the PR initially suggested to use SVE reduction ops. However,
> > > they have higher latency than the proposed sequence, which is why using
> > > neon and base instructions is preferable.
> > >
> > > Test cases were added for 8/16-bit integers for all implemented modes and all
> > > three operations to check the produced assembly.
> > >
> > > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> > > because for aarch64 vector types, either the logical reduction optabs are
> > > implemented or the codegen for reduction operations is good as it is.
> > > This was motivated by failure of a scan-tree-dump directive in the test cases
> > > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> > >
> > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no
> regression.
> > > OK for mainline?
> > >
> > > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> > >
> > > gcc/
> > > 	PR target/113816
> > > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> > > 	Implement for logical bitwise operations for VDQV_E.
> > >
> > > gcc/testsuite/
> > > 	PR target/113816
> > > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> > > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > > ---
> > >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
> > >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
> > >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
> > >  gcc/testsuite/lib/target-supports.exp         |   4 +-
> > >  4 files changed, 267 insertions(+), 2 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > > index 23c03a96371..00286b8b020 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -3608,6 +3608,61 @@
> > >    }
> > >  )
> > >
> > > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> > > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> > > +;; on the two halves of the input.
> > > +;; If the input has 128 bits, the first operation is performed in vector
> > > +;; registers.  From 64 bits down, the reduction steps are performed in general
> > > +;; purpose registers.
> > > +;; For example, for V8HI and operation AND, the intended sequence is:
> > > +;; EXT      v1.16b, v0.16b, v0.16b, #8
> > > +;; AND      v0.8b, v1.8b, v0.8b
> > > +;; FMOV     x0, d0
> > > +;; AND      x0, x0, x0, lsr 32
> > > +;; AND      w0, w0, w0, lsr 16
> > > +;;
> > > +;; For V8QI and operation AND, the sequence is:
> > > +;; AND      x0, x0, x0, lsr 32
> > > +;; AND      w0, w0, w0, lsr 16
> > > +;; AND      w0, w0, w0, lsr 8
> > > +
> > > +(define_expand "reduc_<optab>_scal_<mode>"
> > > + [(match_operand:<VEL> 0 "register_operand")
> > > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> > > +  "TARGET_SIMD"
> > > +  {
> > > +    rtx dst = operands[1];
> > > +    rtx tdi = gen_reg_rtx (DImode);
> > > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> > > +    rtx op1_lo;
> > > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> > > +      {
> > > +	rtx t0 = gen_reg_rtx (<MODE>mode);
> > > +	rtx t1 = gen_reg_rtx (DImode);
> > > +	rtx t2 = gen_reg_rtx (DImode);
> > > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> > > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> > > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> > > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> > > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> > > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> > > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> > > +	emit_move_insn (tdi, t1);
> > > +      }
> > > +    else
> > > +      {
> > > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> > > +	emit_move_insn (tdi, op1_lo);
> > > +      }
> > > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> > > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> > > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> > > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> > > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> > > +    DONE;
> > > +  }
> > > +)
> >
> > Sorry to be awkward, but I've got mixed feelings about this.
> > The sequence produced looks good.  But we're not defining this
> > optab because the target has a special instruction for implementing
> > the operation.  We're just implementing it to tweak the loop emitted
> > by vect_create_epilog_for_reduction.  Maybe we should change that loop
> > instead, perhaps with a target hook?
> 
> So does it currently not work because there's no v8qi, v4qi or v2qi
> vector modes?  I'll note there already _is_
> targetm.vectorize.split_reduction (mode), but it expects a vector
> mode as result and it doesn't get the operation in.
> 

The point of these reductions is that they shouldn't be done on the vector
side.  So even if we had v4qi and v2qi, the reduction shouldn't be done
there, because the latency and throughput of such sub 64-bit operations are
much worse on the vector side; the remaining steps should be done on the
integer side, i.e. in DI and SI.

This is why I disagree with Richard that this can be done generically for
all targets.  The decomposition takes advantage of Arm's fused shifted
logical operations on GPRs, which make these steps really cheap on the
integer side.
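
For reference, a minimal C sketch of what the scalar tail boils down to
(the helper name is mine, it is not in the patch); each statement maps to
a single shifted-operand AND on AArch64:

#include <stdint.h>

static inline int16_t
and_reduce_tail (uint64_t x)
{
  x &= x >> 32;                 /* and  x0, x0, x0, lsr 32 */
  uint32_t w = (uint32_t) x;
  w &= w >> 16;                 /* and  w0, w0, w0, lsr 16 */
  return (int16_t) w;
}

On a target without shifted-operand logicals each of these steps needs an
extra shift instruction.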

Richard brought up AArch32 as another example.  I don't agree there
either, because AArch32's register file overlap means you don't need the
shifts: the first reduction step can simply be done by accessing the
registers as even/odd pairs in a smaller mode.

To get this codegen generically, the target must support modes that are
tie-able between the vector mode and the smaller sizes, i.e. a
BIT_FIELD_REF from the vector mode to a smaller mode must be cheap or
free.
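
In GNU C terms, the reinterpretation below (names made up purely for
illustration) has to end up as a single fmov, or vanish entirely, rather
than being spilled and reloaded through memory:

#include <stdint.h>

typedef uint8_t v8qi __attribute__ ((vector_size (8)));

static inline uint64_t
v8qi_bits (v8qi v)
{
  uint64_t x;
  /* Same-size reinterpretation, roughly the BIT_FIELD_REF above;
     ideally this is just "fmov x0, d0".  */
  __builtin_memcpy (&x, &v, sizeof (x));
  return x;
}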

Both the decision of when to do the transfer from SIMD to GPR and how to
do it are highly target specific.

> I would expect that we can improve on this existing machinery
> instead of inventing a new one.
> 

But we're not.  We're just implementing the optab that the existing
machinery already knows how to use.

I think we're just over-engineering a new contributor's patch.  Even if
we think there is some merit in doing it generically, I don't see why we
shouldn't take this patch today.

Thanks,
Tamar

> I'll note that providing reduc_scal optabs has the side-effect of
> enabling BB reduction vectorization since that doesn't use any
> open-coding (yet - it was indeed supposed to be an indication of
> whether such a reduction would be reasonably cheap).
> 
> Richard.
> 
> > Also, this seems related to the fact that:
> >
> > typedef unsigned char v8qi __attribute__((vector_size(8)));
> > unsigned char foo(v8qi x) {
> >   x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
> >   return x[0];
> > }
> >
> > generates:
> >
> >         ext     v31.8b, v0.8b, v0.8b, #1
> >         and     v31.8b, v31.8b, v0.8b
> >         umov    x0, v31.d[0]
> >         ret
> >
> > instead of something like:
> >
> >         umov    w0, v31.h[0]
> >         and     w0, w0, lsr #8
> >         ret
> >
> > So an alternative might be to do some post-vectorisation simplification,
> > perhaps in isel?
> >
> > cc:ing Richi for his thoughts.
> >
> > Thanks,
> > Richard
> >
> > > +
> > >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> > >   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
> > >         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > new file mode 100644
> > > index 00000000000..9508288b218
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > @@ -0,0 +1,208 @@
> > > +/* { dg-options "-O2 -ftree-vectorize" } */
> > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > +
> > > +#include <stdint.h>
> > > +
> > > +/*
> > > +** fv16qi_and:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	and	x0, x0, x0, lsr 32
> > > +**	and	w0, w0, w0, lsr 16
> > > +**	and	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv16qi_and (int8_t *a)
> > > +{
> > > +  int8_t b = -1;
> > > +  for (int i = 0; i < 16; ++i)
> > > +    b &= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8hi_and:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	and	x0, x0, x0, lsr 32
> > > +**	and	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv8hi_and (int16_t *a)
> > > +{
> > > +  int16_t b = -1;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b &= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv16qi_or:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	orr	x0, x0, x0, lsr 32
> > > +**	orr	w0, w0, w0, lsr 16
> > > +**	orr	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv16qi_or (int8_t *a)
> > > +{
> > > +  int8_t b = 0;
> > > +  for (int i = 0; i < 16; ++i)
> > > +    b |= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8hi_or:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	orr	x0, x0, x0, lsr 32
> > > +**	orr	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv8hi_or (int16_t *a)
> > > +{
> > > +  int16_t b = 0;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b |= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv16qi_xor:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	eor	x0, x0, x0, lsr 32
> > > +**	eor	w0, w0, w0, lsr 16
> > > +**	eor	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv16qi_xor (int8_t *a)
> > > +{
> > > +  int8_t b = 0;
> > > +  for (int i = 0; i < 16; ++i)
> > > +    b ^= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8hi_xor:
> > > +**	ldr	q([0-9]+), \[x0\]
> > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > > +**	fmov	x0, d\1
> > > +**	eor	x0, x0, x0, lsr 32
> > > +**	eor	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv8hi_xor (int16_t *a)
> > > +{
> > > +  int16_t b = 0;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b ^= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8qi_and:
> > > +**	ldr	x0, \[x0\]
> > > +**	and	x0, x0, x0, lsr 32
> > > +**	and	w0, w0, w0, lsr 16
> > > +**	and	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv8qi_and (int8_t *a)
> > > +{
> > > +  int8_t b = -1;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b &= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv4hi_and:
> > > +**	ldr	x0, \[x0\]
> > > +**	and	x0, x0, x0, lsr 32
> > > +**	and	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv4hi_and (int16_t *a)
> > > +{
> > > +  int16_t b = -1;
> > > +  for (int i = 0; i < 4; ++i)
> > > +    b &= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8qi_or:
> > > +**	ldr	x0, \[x0\]
> > > +**	orr	x0, x0, x0, lsr 32
> > > +**	orr	w0, w0, w0, lsr 16
> > > +**	orr	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv8qi_or (int8_t *a)
> > > +{
> > > +  int8_t b = 0;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b |= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv4hi_or:
> > > +**	ldr	x0, \[x0\]
> > > +**	orr	x0, x0, x0, lsr 32
> > > +**	orr	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv4hi_or (int16_t *a)
> > > +{
> > > +  int16_t b = 0;
> > > +  for (int i = 0; i < 4; ++i)
> > > +    b |= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv8qi_xor:
> > > +**	ldr	x0, \[x0\]
> > > +**	eor	x0, x0, x0, lsr 32
> > > +**	eor	w0, w0, w0, lsr 16
> > > +**	eor	w0, w0, w0, lsr 8
> > > +**	ret
> > > +*/
> > > +int8_t fv8qi_xor (int8_t *a)
> > > +{
> > > +  int8_t b = 0;
> > > +  for (int i = 0; i < 8; ++i)
> > > +    b ^= a[i];
> > > +  return b;
> > > +}
> > > +
> > > +/*
> > > +** fv4hi_xor:
> > > +**	ldr	x0, \[x0\]
> > > +**	eor	x0, x0, x0, lsr 32
> > > +**	eor	w0, w0, w0, lsr 16
> > > +**	ret
> > > +*/
> > > +int16_t fv4hi_xor (int16_t *a)
> > > +{
> > > +  int16_t b = 0;
> > > +  for (int i = 0; i < 4; ++i)
> > > +    b ^= a[i];
> > > +  return b;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > index 918822a7d00..70c4ca18094 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
> > >    return 0;
> > >  }
> > >
> > > -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> > > +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } }
> */
> > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> > > index 8f2afe866c7..44f737f15d0 100644
> > > --- a/gcc/testsuite/lib/target-supports.exp
> > > +++ b/gcc/testsuite/lib/target-supports.exp
> > > @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
> > >  		   || [istarget amdgcn-*-*]
> > >  		   || [check_effective_target_riscv_v]
> > >  		   || [check_effective_target_loongarch_sx]
> > > -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> > > +		   || [istarget i?86-*-*]
> > > +		   || [istarget x86_64-*-*]
> > > +		   || [istarget aarch64*-*-*]}]
> > >  }
> > >
> > >  # Return 1 if the target supports the fold_extract_last optab.
> >
> 
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
Richard Biener Oct. 11, 2024, 7:13 a.m. UTC | #7
On Fri, 11 Oct 2024, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, October 11, 2024 7:52 AM
> > To: Richard Sandiford <Richard.Sandiford@arm.com>
> > Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
> > Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
> > <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
> > Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
> > reductions
> > 
> > On Thu, 10 Oct 2024, Richard Sandiford wrote:
> > 
> > > Jennifer Schmitz <jschmitz@nvidia.com> writes:
> > > > This patch implements the optabs reduc_and_scal_<mode>,
> > > > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
> > > > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
> > logical
> > > > vector reduction operations.
> > > > Previously, either only vector registers or only general purpose registers (GPR)
> > > > were used. Now, vector registers are used for the reduction from 128 to 64
> > bits;
> > > > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
> > used
> > > > for the rest of the reduction steps.
> > > >
> > > > For example, the test case (V8HI)
> > > > int16_t foo (int16_t *a)
> > > > {
> > > >   int16_t b = -1;
> > > >   for (int i = 0; i < 8; ++i)
> > > >     b &= a[i];
> > > >   return b;
> > > > }
> > > >
> > > > was previously compiled to (-O2):
> > > > foo:
> > > > 	ldr     q0, [x0]
> > > > 	movi    v30.4s, 0
> > > > 	ext     v29.16b, v0.16b, v30.16b, #8
> > > > 	and     v29.16b, v29.16b, v0.16b
> > > > 	ext     v31.16b, v29.16b, v30.16b, #4
> > > > 	and     v31.16b, v31.16b, v29.16b
> > > > 	ext     v30.16b, v31.16b, v30.16b, #2
> > > > 	and     v30.16b, v30.16b, v31.16b
> > > > 	umov    w0, v30.h[0]
> > > > 	ret
> > > >
> > > > With patch, it is compiled to:
> > > > foo:
> > > > 	ldr     q31, [x0]
> > > > 	ext     v30.16b, v31.16b, v31.16b, #8
> > > > 	and     v31.8b, v30.8b, v31.8b
> > > > 	fmov    x0, d31
> > > > 	and     x0, x0, x0, lsr 32
> > > > 	and     w0, w0, w0, lsr 16
> > > > 	ret
> > > >
> > > > For modes V4SI and V2DI, the pattern was not implemented, because the
> > > > current codegen (using only base instructions) is already efficient.
> > > >
> > > > Note that the PR initially suggested to use SVE reduction ops. However,
> > > > they have higher latency than the proposed sequence, which is why using
> > > > neon and base instructions is preferable.
> > > >
> > > > Test cases were added for 8/16-bit integers for all implemented modes and all
> > > > three operations to check the produced assembly.
> > > >
> > > > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
> > > > because for aarch64 vector types, either the logical reduction optabs are
> > > > implemented or the codegen for reduction operations is good as it is.
> > > > This was motivated by failure of a scan-tree-dump directive in the test cases
> > > > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
> > > >
> > > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no
> > regression.
> > > > OK for mainline?
> > > >
> > > > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
> > > >
> > > > gcc/
> > > > 	PR target/113816
> > > > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
> > > > 	Implement for logical bitwise operations for VDQV_E.
> > > >
> > > > gcc/testsuite/
> > > > 	PR target/113816
> > > > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
> > > > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
> > > > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
> > > > ---
> > > >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
> > > >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
> > > >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
> > > >  gcc/testsuite/lib/target-supports.exp         |   4 +-
> > > >  4 files changed, 267 insertions(+), 2 deletions(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > >
> > > > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > > > index 23c03a96371..00286b8b020 100644
> > > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > > @@ -3608,6 +3608,61 @@
> > > >    }
> > > >  )
> > > >
> > > > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
> > > > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
> > > > +;; on the two halves of the input.
> > > > +;; If the input has 128 bits, the first operation is performed in vector
> > > > +;; registers.  From 64 bits down, the reduction steps are performed in general
> > > > +;; purpose registers.
> > > > +;; For example, for V8HI and operation AND, the intended sequence is:
> > > > +;; EXT      v1.16b, v0.16b, v0.16b, #8
> > > > +;; AND      v0.8b, v1.8b, v0.8b
> > > > +;; FMOV     x0, d0
> > > > +;; AND      x0, x0, x0, lsr 32
> > > > +;; AND      w0, w0, w0, lsr 16
> > > > +;;
> > > > +;; For V8QI and operation AND, the sequence is:
> > > > +;; AND      x0, x0, x0, lsr 32
> > > > +;; AND      w0, w0, w0, lsr 16
> > > > +;; AND      w0, w0, w0, lsr 8
> > > > +
> > > > +(define_expand "reduc_<optab>_scal_<mode>"
> > > > + [(match_operand:<VEL> 0 "register_operand")
> > > > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
> > > > +  "TARGET_SIMD"
> > > > +  {
> > > > +    rtx dst = operands[1];
> > > > +    rtx tdi = gen_reg_rtx (DImode);
> > > > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
> > > > +    rtx op1_lo;
> > > > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
> > > > +      {
> > > > +	rtx t0 = gen_reg_rtx (<MODE>mode);
> > > > +	rtx t1 = gen_reg_rtx (DImode);
> > > > +	rtx t2 = gen_reg_rtx (DImode);
> > > > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
> > > > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
> > > > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
> > > > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
> > > > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
> > > > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
> > > > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
> > > > +	emit_move_insn (tdi, t1);
> > > > +      }
> > > > +    else
> > > > +      {
> > > > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
> > > > +	emit_move_insn (tdi, op1_lo);
> > > > +      }
> > > > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
> > > > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
> > > > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
> > > > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
> > > > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
> > > > +    DONE;
> > > > +  }
> > > > +)
> > >
> > > Sorry to be awkward, but I've got mixed feelings about this.
> > > The sequence produced looks good.  But we're not defining this
> > > optab because the target has a special instruction for implementing
> > > the operation.  We're just implementing it to tweak the loop emitted
> > > by vect_create_epilog_for_reduction.  Maybe we should change that loop
> > > instead, perhaps with a target hook?
> > 
> > So does it currently not work because there's no v8qi, v4qi or v2qi
> > vector modes?  I'll note there already _is_
> > targetm.vectorize.split_reduction (mode), but it expects a vector
> > mode as result and it doesn't get the operation in.
> > 
> 
> The point of the reduction is that they shouldn't be done on the vector side.
> So even if we have v4qi and v2qi the reduction shouldn't be done there as
> the latency and throughput for these sub 64-bit operations on the vector
> side is much higher and so should be done on the integer side. i.e. these
> should be done on DI and SI.
> 
> This is why I disagree with Richard that this can be done generically for all
> targets. The decomposition takes advantage of Arm's fused shifted logical
> operations on GPR.  This makes them really cheap to do on integer.
> 
> Richard brought up AArch32 as another example. I don't agree there either
> Because AArch32's register file overlap means you don't need shifts.
> The first step reduction can just be done by accessing the registers
> as even/odd pairs on a smaller mode.
> 
> To get this codegen generically the targets must supports modes tie-able
> between the vector and smaller vector size. i.e.  BIT_FIELD_REF on the
> vector mode to a smaller mode must be cheap or free.
> 
> The decision for when you do the transfer from SIMD to GPR is highly
> target specific, and the how as well.
> 
> > I would expect that we can improve on this existing machinery
> > instead of inventing a new one.
> > 
> 
> But we're not. We're just implementing the optab to the machinery
> that already exists.
> 
> I think we're just over engineering a new contributor's patch. Even if
> we think there is some merit to do it generically, I don't see why we
> shouldn't take this patch today.

Sure.  I was just saying that there is already a target-driven piece in
the generic machinery that looks like a somewhat fitting place if we
enhance it, so there's no need to invent something completely new here.

Richard.

> Thanks,
> Tamar
> 
> > I'll note that providing reduc_scal optabs has the side-effect of
> > enabling BB reduction vectorization since that doesn't use any
> > open-coding (yet - it was indeed supposed to be an indication of
> > whether such a reduction would be reasonably cheap).
> > 
> > Richard.
> > 
> > > Also, this seems related to the fact that:
> > >
> > > typedef unsigned char v8qi __attribute__((vector_size(8)));
> > > unsigned char foo(v8qi x) {
> > >   x &= __builtin_shufflevector (x, x, 1, 2, 3, 4, 5, 6, 7, 0);
> > >   return x[0];
> > > }
> > >
> > > generates:
> > >
> > >         ext     v31.8b, v0.8b, v0.8b, #1
> > >         and     v31.8b, v31.8b, v0.8b
> > >         umov    x0, v31.d[0]
> > >         ret
> > >
> > > instead of something like:
> > >
> > >         umov    w0, v31.h[0]
> > >         and     w0, w0, lsr #8
> > >         ret
> > >
> > > So an alternative might be to do some post-vectorisation simplification,
> > > perhaps in isel?
> > >
> > > cc:ing Richi for his thoughts.
> > >
> > > Thanks,
> > > Richard
> > >
> > > > +
> > > >  (define_insn "aarch64_reduc_<optab>_internal<mode>"
> > > >   [(set (match_operand:VDQV_S 0 "register_operand" "=w")
> > > >         (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > > new file mode 100644
> > > > index 00000000000..9508288b218
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
> > > > @@ -0,0 +1,208 @@
> > > > +/* { dg-options "-O2 -ftree-vectorize" } */
> > > > +/* { dg-final { check-function-bodies "**" "" } } */
> > > > +
> > > > +#include <stdint.h>
> > > > +
> > > > +/*
> > > > +** fv16qi_and:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	and	x0, x0, x0, lsr 32
> > > > +**	and	w0, w0, w0, lsr 16
> > > > +**	and	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv16qi_and (int8_t *a)
> > > > +{
> > > > +  int8_t b = -1;
> > > > +  for (int i = 0; i < 16; ++i)
> > > > +    b &= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8hi_and:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	and	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	and	x0, x0, x0, lsr 32
> > > > +**	and	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv8hi_and (int16_t *a)
> > > > +{
> > > > +  int16_t b = -1;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b &= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv16qi_or:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	orr	x0, x0, x0, lsr 32
> > > > +**	orr	w0, w0, w0, lsr 16
> > > > +**	orr	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv16qi_or (int8_t *a)
> > > > +{
> > > > +  int8_t b = 0;
> > > > +  for (int i = 0; i < 16; ++i)
> > > > +    b |= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8hi_or:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	orr	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	orr	x0, x0, x0, lsr 32
> > > > +**	orr	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv8hi_or (int16_t *a)
> > > > +{
> > > > +  int16_t b = 0;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b |= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv16qi_xor:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	eor	x0, x0, x0, lsr 32
> > > > +**	eor	w0, w0, w0, lsr 16
> > > > +**	eor	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv16qi_xor (int8_t *a)
> > > > +{
> > > > +  int8_t b = 0;
> > > > +  for (int i = 0; i < 16; ++i)
> > > > +    b ^= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8hi_xor:
> > > > +**	ldr	q([0-9]+), \[x0\]
> > > > +**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
> > > > +**	eor	v\1\.8b, v\2\.8b, v\1\.8b
> > > > +**	fmov	x0, d\1
> > > > +**	eor	x0, x0, x0, lsr 32
> > > > +**	eor	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv8hi_xor (int16_t *a)
> > > > +{
> > > > +  int16_t b = 0;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b ^= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8qi_and:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	and	x0, x0, x0, lsr 32
> > > > +**	and	w0, w0, w0, lsr 16
> > > > +**	and	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv8qi_and (int8_t *a)
> > > > +{
> > > > +  int8_t b = -1;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b &= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv4hi_and:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	and	x0, x0, x0, lsr 32
> > > > +**	and	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv4hi_and (int16_t *a)
> > > > +{
> > > > +  int16_t b = -1;
> > > > +  for (int i = 0; i < 4; ++i)
> > > > +    b &= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8qi_or:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	orr	x0, x0, x0, lsr 32
> > > > +**	orr	w0, w0, w0, lsr 16
> > > > +**	orr	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv8qi_or (int8_t *a)
> > > > +{
> > > > +  int8_t b = 0;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b |= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv4hi_or:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	orr	x0, x0, x0, lsr 32
> > > > +**	orr	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv4hi_or (int16_t *a)
> > > > +{
> > > > +  int16_t b = 0;
> > > > +  for (int i = 0; i < 4; ++i)
> > > > +    b |= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv8qi_xor:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	eor	x0, x0, x0, lsr 32
> > > > +**	eor	w0, w0, w0, lsr 16
> > > > +**	eor	w0, w0, w0, lsr 8
> > > > +**	ret
> > > > +*/
> > > > +int8_t fv8qi_xor (int8_t *a)
> > > > +{
> > > > +  int8_t b = 0;
> > > > +  for (int i = 0; i < 8; ++i)
> > > > +    b ^= a[i];
> > > > +  return b;
> > > > +}
> > > > +
> > > > +/*
> > > > +** fv4hi_xor:
> > > > +**	ldr	x0, \[x0\]
> > > > +**	eor	x0, x0, x0, lsr 32
> > > > +**	eor	w0, w0, w0, lsr 16
> > > > +**	ret
> > > > +*/
> > > > +int16_t fv4hi_xor (int16_t *a)
> > > > +{
> > > > +  int16_t b = 0;
> > > > +  for (int i = 0; i < 4; ++i)
> > > > +    b ^= a[i];
> > > > +  return b;
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > > index 918822a7d00..70c4ca18094 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
> > > > @@ -32,4 +32,4 @@ main (unsigned char argc, char **argv)
> > > >    return 0;
> > > >  }
> > > >
> > > > -/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
> > > > +/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } }
> > */
> > > > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> > supports.exp
> > > > index 8f2afe866c7..44f737f15d0 100644
> > > > --- a/gcc/testsuite/lib/target-supports.exp
> > > > +++ b/gcc/testsuite/lib/target-supports.exp
> > > > @@ -9564,7 +9564,9 @@ proc check_effective_target_vect_logical_reduc { } {
> > > >  		   || [istarget amdgcn-*-*]
> > > >  		   || [check_effective_target_riscv_v]
> > > >  		   || [check_effective_target_loongarch_sx]
> > > > -		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
> > > > +		   || [istarget i?86-*-*]
> > > > +		   || [istarget x86_64-*-*]
> > > > +		   || [istarget aarch64*-*-*]}]
> > > >  }
> > > >
> > > >  # Return 1 if the target supports the fold_extract_last optab.
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
Richard Sandiford Oct. 11, 2024, 10:08 a.m. UTC | #8
Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Biener <rguenther@suse.de>
>> Sent: Friday, October 11, 2024 7:52 AM
>> To: Richard Sandiford <Richard.Sandiford@arm.com>
>> Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
>> Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
>> <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>> reductions
>> 
>> On Thu, 10 Oct 2024, Richard Sandiford wrote:
>> 
>> > Jennifer Schmitz <jschmitz@nvidia.com> writes:
>> > > This patch implements the optabs reduc_and_scal_<mode>,
>> > > reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
>> > > V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
>> logical
>> > > vector reduction operations.
>> > > Previously, either only vector registers or only general purpose registers (GPR)
>> > > were used. Now, vector registers are used for the reduction from 128 to 64
>> bits;
>> > > 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
>> used
>> > > for the rest of the reduction steps.
>> > >
>> > > For example, the test case (V8HI)
>> > > int16_t foo (int16_t *a)
>> > > {
>> > >   int16_t b = -1;
>> > >   for (int i = 0; i < 8; ++i)
>> > >     b &= a[i];
>> > >   return b;
>> > > }
>> > >
>> > > was previously compiled to (-O2):
>> > > foo:
>> > > 	ldr     q0, [x0]
>> > > 	movi    v30.4s, 0
>> > > 	ext     v29.16b, v0.16b, v30.16b, #8
>> > > 	and     v29.16b, v29.16b, v0.16b
>> > > 	ext     v31.16b, v29.16b, v30.16b, #4
>> > > 	and     v31.16b, v31.16b, v29.16b
>> > > 	ext     v30.16b, v31.16b, v30.16b, #2
>> > > 	and     v30.16b, v30.16b, v31.16b
>> > > 	umov    w0, v30.h[0]
>> > > 	ret
>> > >
>> > > With patch, it is compiled to:
>> > > foo:
>> > > 	ldr     q31, [x0]
>> > > 	ext     v30.16b, v31.16b, v31.16b, #8
>> > > 	and     v31.8b, v30.8b, v31.8b
>> > > 	fmov    x0, d31
>> > > 	and     x0, x0, x0, lsr 32
>> > > 	and     w0, w0, w0, lsr 16
>> > > 	ret
>> > >
>> > > For modes V4SI and V2DI, the pattern was not implemented, because the
>> > > current codegen (using only base instructions) is already efficient.
>> > >
>> > > Note that the PR initially suggested to use SVE reduction ops. However,
>> > > they have higher latency than the proposed sequence, which is why using
>> > > neon and base instructions is preferable.
>> > >
>> > > Test cases were added for 8/16-bit integers for all implemented modes and all
>> > > three operations to check the produced assembly.
>> > >
>> > > We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
>> > > because for aarch64 vector types, either the logical reduction optabs are
>> > > implemented or the codegen for reduction operations is good as it is.
>> > > This was motivated by failure of a scan-tree-dump directive in the test cases
>> > > gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>> > >
>> > > The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>> regression.
>> > > OK for mainline?
>> > >
>> > > Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>> > >
>> > > gcc/
>> > > 	PR target/113816
>> > > 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>> > > 	Implement for logical bitwise operations for VDQV_E.
>> > >
>> > > gcc/testsuite/
>> > > 	PR target/113816
>> > > 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>> > > 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
>> > > 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
>> > > ---
>> > >  gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>> > >  .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>> > >  .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>> > >  gcc/testsuite/lib/target-supports.exp         |   4 +-
>> > >  4 files changed, 267 insertions(+), 2 deletions(-)
>> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>> > >
>> > > diff --git a/gcc/config/aarch64/aarch64-simd.md
>> b/gcc/config/aarch64/aarch64-simd.md
>> > > index 23c03a96371..00286b8b020 100644
>> > > --- a/gcc/config/aarch64/aarch64-simd.md
>> > > +++ b/gcc/config/aarch64/aarch64-simd.md
>> > > @@ -3608,6 +3608,61 @@
>> > >    }
>> > >  )
>> > >
>> > > +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
>> > > +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
>> > > +;; on the two halves of the input.
>> > > +;; If the input has 128 bits, the first operation is performed in vector
>> > > +;; registers.  From 64 bits down, the reduction steps are performed in general
>> > > +;; purpose registers.
>> > > +;; For example, for V8HI and operation AND, the intended sequence is:
>> > > +;; EXT      v1.16b, v0.16b, v0.16b, #8
>> > > +;; AND      v0.8b, v1.8b, v0.8b
>> > > +;; FMOV     x0, d0
>> > > +;; AND      x0, x0, x0, lsr 32
>> > > +;; AND      w0, w0, w0, lsr 16
>> > > +;;
>> > > +;; For V8QI and operation AND, the sequence is:
>> > > +;; AND      x0, x0, x0, lsr 32
>> > > +;; AND      w0, w0, w0, lsr 16
>> > > +;; AND      w0, w0, w0, lsr 8
>> > > +
>> > > +(define_expand "reduc_<optab>_scal_<mode>"
>> > > + [(match_operand:<VEL> 0 "register_operand")
>> > > +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
>> > > +  "TARGET_SIMD"
>> > > +  {
>> > > +    rtx dst = operands[1];
>> > > +    rtx tdi = gen_reg_rtx (DImode);
>> > > +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
>> > > +    rtx op1_lo;
>> > > +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
>> > > +      {
>> > > +	rtx t0 = gen_reg_rtx (<MODE>mode);
>> > > +	rtx t1 = gen_reg_rtx (DImode);
>> > > +	rtx t2 = gen_reg_rtx (DImode);
>> > > +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
>> > > +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
>> > > +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
>> > > +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
>> > > +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
>> > > +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
>> > > +	emit_insn (gen_<optab>di3 (t1, t1, t2));
>> > > +	emit_move_insn (tdi, t1);
>> > > +      }
>> > > +    else
>> > > +      {
>> > > +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
>> > > +	emit_move_insn (tdi, op1_lo);
>> > > +      }
>> > > +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
>> > > +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
>> > > +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
>> > > +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
>> > > +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
>> > > +    DONE;
>> > > +  }
>> > > +)
>> >
>> > Sorry to be awkward, but I've got mixed feelings about this.
>> > The sequence produced looks good.  But we're not defining this
>> > optab because the target has a special instruction for implementing
>> > the operation.  We're just implementing it to tweak the loop emitted
>> > by vect_create_epilog_for_reduction.  Maybe we should change that loop
>> > instead, perhaps with a target hook?
>> 
>> So does it currently not work because there's no v8qi, v4qi or v2qi
>> vector modes?  I'll note there already _is_
>> targetm.vectorize.split_reduction (mode), but it expects a vector
>> mode as result and it doesn't get the operation in.
>> 
>
> The point of the reduction is that they shouldn't be done on the vector side.
> So even if we have v4qi and v2qi the reduction shouldn't be done there as
> the latency and throughput for these sub 64-bit operations on the vector
> side is much higher and so should be done on the integer side. i.e. these
> should be done on DI and SI.
>
> This is why I disagree with Richard that this can be done generically for all
> targets. The decomposition takes advantage of Arm's fused shifted logical
> operations on GPR.  This makes them really cheap to do on integer.

Wouldn't it still be a win without that though?  Having the fused operation
gives us a ~4x improvement in latency on a typical core (EXT + vector AND
-> fused scalar AND).  But the unfused operation would still be a ~2x
improvement.

> Richard brought up AArch32 as another example. I don't agree there either
> Because AArch32's register file overlap means you don't need shifts.
> The first step reduction can just be done by accessing the registers
> as even/odd pairs on a smaller mode.
>
> To get this codegen generically the targets must supports modes tie-able
> between the vector and smaller vector size. i.e.  BIT_FIELD_REF on the
> vector mode to a smaller mode must be cheap or free.
>
> The decision for when you do the transfer from SIMD to GPR is highly
> target specific, and the how as well.
>
>> I would expect that we can improve on this existing machinery
>> instead of inventing a new one.
>> 
>
> But we're not. We're just implementing the optab to the machinery
> that already exists.

I don't think this kind of expansion is what optabs are there for though.
I suppose it was in the pre-gimple days, when all optimisation was done
on RTL.  (Although of course that also predates vectorisation.)  But now,
I think the purpose of optabs is to advertise what the target can do via
some form of direct h/w support.

The original patch (using ANDV) was an ideal use of optabs.  But I think
the request to use a tweaked version of the current open-coded reduction
scheme, although a good suggestion, also changes the way that we should
go about it.

> I think we're just over engineering a new contributor's patch. Even if
> we think there is some merit to do it generically, I don't see why we
> shouldn't take this patch today.

Sorry if it comes across that way.

Thanks,
Richard
Jeff Law Oct. 11, 2024, 2:06 p.m. UTC | #9
On 10/11/24 4:08 AM, Richard Sandiford wrote:
> Tamar Christina <Tamar.Christina@arm.com> writes:
>>> -----Original Message-----
>>> From: Richard Biener <rguenther@suse.de>
>>> Sent: Friday, October 11, 2024 7:52 AM
>>> To: Richard Sandiford <Richard.Sandiford@arm.com>
>>> Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
>>> Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
>>> <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
>>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>>> reductions
>>>
>>> On Thu, 10 Oct 2024, Richard Sandiford wrote:
>>>
>>>> Jennifer Schmitz <jschmitz@nvidia.com> writes:
>>>>> This patch implements the optabs reduc_and_scal_<mode>,
>>>>> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
>>>>> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
>>> logical
>>>>> vector reduction operations.
>>>>> Previously, either only vector registers or only general purpose registers (GPR)
>>>>> were used. Now, vector registers are used for the reduction from 128 to 64
>>> bits;
>>>>> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
>>> used
>>>>> for the rest of the reduction steps.
>>>>>
>>>>> For example, the test case (V8HI)
>>>>> int16_t foo (int16_t *a)
>>>>> {
>>>>>    int16_t b = -1;
>>>>>    for (int i = 0; i < 8; ++i)
>>>>>      b &= a[i];
>>>>>    return b;
>>>>> }
>>>>>
>>>>> was previously compiled to (-O2):
>>>>> foo:
>>>>> 	ldr     q0, [x0]
>>>>> 	movi    v30.4s, 0
>>>>> 	ext     v29.16b, v0.16b, v30.16b, #8
>>>>> 	and     v29.16b, v29.16b, v0.16b
>>>>> 	ext     v31.16b, v29.16b, v30.16b, #4
>>>>> 	and     v31.16b, v31.16b, v29.16b
>>>>> 	ext     v30.16b, v31.16b, v30.16b, #2
>>>>> 	and     v30.16b, v30.16b, v31.16b
>>>>> 	umov    w0, v30.h[0]
>>>>> 	ret
>>>>>
>>>>> With patch, it is compiled to:
>>>>> foo:
>>>>> 	ldr     q31, [x0]
>>>>> 	ext     v30.16b, v31.16b, v31.16b, #8
>>>>> 	and     v31.8b, v30.8b, v31.8b
>>>>> 	fmov    x0, d31
>>>>> 	and     x0, x0, x0, lsr 32
>>>>> 	and     w0, w0, w0, lsr 16
>>>>> 	ret
>>>>>
>>>>> For modes V4SI and V2DI, the pattern was not implemented, because the
>>>>> current codegen (using only base instructions) is already efficient.
>>>>>
>>>>> Note that the PR initially suggested to use SVE reduction ops. However,
>>>>> they have higher latency than the proposed sequence, which is why using
>>>>> neon and base instructions is preferable.
>>>>>
>>>>> Test cases were added for 8/16-bit integers for all implemented modes and all
>>>>> three operations to check the produced assembly.
>>>>>
>>>>> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
>>>>> because for aarch64 vector types, either the logical reduction optabs are
>>>>> implemented or the codegen for reduction operations is good as it is.
>>>>> This was motivated by failure of a scan-tree-dump directive in the test cases
>>>>> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>>>>>
>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>>> regression.
>>>>> OK for mainline?
>>>>>
>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>>>
>>>>> gcc/
>>>>> 	PR target/113816
>>>>> 	* config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>>>>> 	Implement for logical bitwise operations for VDQV_E.
>>>>>
>>>>> gcc/testsuite/
>>>>> 	PR target/113816
>>>>> 	* lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>>>>> 	* gcc.target/aarch64/simd/logical_reduc.c: New test.
>>>>> 	* gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
>>>>> ---
>>>>>   gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>>>>>   .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>>>>>   .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>>>>>   gcc/testsuite/lib/target-supports.exp         |   4 +-
>>>>>   4 files changed, 267 insertions(+), 2 deletions(-)
>>>>>   create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>>>>>
>>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>> b/gcc/config/aarch64/aarch64-simd.md
>>>>> index 23c03a96371..00286b8b020 100644
>>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>>> @@ -3608,6 +3608,61 @@
>>>>>     }
>>>>>   )
>>>>>
>>>>> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
>>>>> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
>>>>> +;; on the two halves of the input.
>>>>> +;; If the input has 128 bits, the first operation is performed in vector
>>>>> +;; registers.  From 64 bits down, the reduction steps are performed in general
>>>>> +;; purpose registers.
>>>>> +;; For example, for V8HI and operation AND, the intended sequence is:
>>>>> +;; EXT      v1.16b, v0.16b, v0.16b, #8
>>>>> +;; AND      v0.8b, v1.8b, v0.8b
>>>>> +;; FMOV     x0, d0
>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>> +;;
>>>>> +;; For V8QI and operation AND, the sequence is:
>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>> +;; AND      w0, w0, w0, lsr 8
>>>>> +
>>>>> +(define_expand "reduc_<optab>_scal_<mode>"
>>>>> + [(match_operand:<VEL> 0 "register_operand")
>>>>> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
>>>>> +  "TARGET_SIMD"
>>>>> +  {
>>>>> +    rtx dst = operands[1];
>>>>> +    rtx tdi = gen_reg_rtx (DImode);
>>>>> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
>>>>> +    rtx op1_lo;
>>>>> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
>>>>> +      {
>>>>> +	rtx t0 = gen_reg_rtx (<MODE>mode);
>>>>> +	rtx t1 = gen_reg_rtx (DImode);
>>>>> +	rtx t2 = gen_reg_rtx (DImode);
>>>>> +	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
>>>>> +	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
>>>>> +	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
>>>>> +	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
>>>>> +	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
>>>>> +	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
>>>>> +	emit_insn (gen_<optab>di3 (t1, t1, t2));
>>>>> +	emit_move_insn (tdi, t1);
>>>>> +      }
>>>>> +    else
>>>>> +      {
>>>>> +	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
>>>>> +	emit_move_insn (tdi, op1_lo);
>>>>> +      }
>>>>> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
>>>>> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
>>>>> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
>>>>> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
>>>>> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
>>>>> +    DONE;
>>>>> +  }
>>>>> +)
>>>>
>>>> Sorry to be awkward, but I've got mixed feelings about this.
>>>> The sequence produced looks good.  But we're not defining this
>>>> optab because the target has a special instruction for implementing
>>>> the operation.  We're just implementing it to tweak the loop emitted
>>>> by vect_create_epilog_for_reduction.  Maybe we should change that loop
>>>> instead, perhaps with a target hook?
>>>
>>> So does it currently not work because there's no v8qi, v4qi or v2qi
>>> vector modes?  I'll note there already _is_
>>> targetm.vectorize.split_reduction (mode), but it expects a vector
>>> mode as result and it doesn't get the operation in.
>>>
>>
>> The point of the reduction is that they shouldn't be done on the vector side.
>> So even if we have v4qi and v2qi the reduction shouldn't be done there as
>> the latency and throughput for these sub 64-bit operations on the vector
>> side is much higher and so should be done on the integer side. i.e. these
>> should be done on DI and SI.
>>
>> This is why I disagree with Richard that this can be done generically for all
>> targets. The decomposition takes advantage of Arm's fused shifted logical
>> operations on GPR.  This makes them really cheap to do on integer.
> 
> Wouldn't it still be a win without that though?  Having the fused operation
> gives us a ~4x improvement in latency on a typical core (EXT + vector AND
> -> fused scalar AND).  But the unfused operation would still be a ~2x
> improvement.
That was my thought as well.  We've got scalar ALUs that will easily
outrun a vector unit for small vectors, even without any magic fusions.
So for something like 4qi vectors, if we can do them in GPRs, then it's
likely a win.

jeff
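
For reference, the scalar folding under discussion amounts to the following C idiom; this is a minimal sketch for illustration only (not code from the patch), and on AArch64 each step is expected to map to one AND with a fused LSR operand, as in the sequence shown above:

#include <stdint.h>

/* Illustrative only: fold eight bytes held in a 64-bit value down to
   the AND of all eight, using scalar shifts and bitwise ANDs.  */
static inline uint8_t
and_reduce_8bytes (uint64_t x)
{
  x &= x >> 32;                 /* and x0, x0, x0, lsr 32 */
  uint32_t w = (uint32_t) x;
  w &= w >> 16;                 /* and w0, w0, w0, lsr 16 */
  w &= w >> 8;                  /* and w0, w0, w0, lsr 8  */
  return (uint8_t) w;           /* low byte holds the reduction */
}

The same shape still helps on targets without the fused shifted-logical forms; each step is then a shift plus a logical op, which is the ~2x case mentioned above.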
Jennifer Schmitz Oct. 11, 2024, 2:42 p.m. UTC | #10
> On 11 Oct 2024, at 12:08, Richard Sandiford <richard.sandiford@arm.com> wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
>>> -----Original Message-----
>>> From: Richard Biener <rguenther@suse.de>
>>> Sent: Friday, October 11, 2024 7:52 AM
>>> To: Richard Sandiford <Richard.Sandiford@arm.com>
>>> Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
>>> Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
>>> <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
>>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>>> reductions
>>> 
>>> On Thu, 10 Oct 2024, Richard Sandiford wrote:
>>> 
>>>> Jennifer Schmitz <jschmitz@nvidia.com> writes:
>>>>> This patch implements the optabs reduc_and_scal_<mode>,
>>>>> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
>>>>> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
>>> logical
>>>>> vector reduction operations.
>>>>> Previously, either only vector registers or only general purpose registers (GPR)
>>>>> were used. Now, vector registers are used for the reduction from 128 to 64
>>> bits;
>>>>> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
>>> used
>>>>> for the rest of the reduction steps.
>>>>> 
>>>>> For example, the test case (V8HI)
>>>>> int16_t foo (int16_t *a)
>>>>> {
>>>>>  int16_t b = -1;
>>>>>  for (int i = 0; i < 8; ++i)
>>>>>    b &= a[i];
>>>>>  return b;
>>>>> }
>>>>> 
>>>>> was previously compiled to (-O2):
>>>>> foo:
>>>>> ldr     q0, [x0]
>>>>> movi    v30.4s, 0
>>>>> ext     v29.16b, v0.16b, v30.16b, #8
>>>>> and     v29.16b, v29.16b, v0.16b
>>>>> ext     v31.16b, v29.16b, v30.16b, #4
>>>>> and     v31.16b, v31.16b, v29.16b
>>>>> ext     v30.16b, v31.16b, v30.16b, #2
>>>>> and     v30.16b, v30.16b, v31.16b
>>>>> umov    w0, v30.h[0]
>>>>> ret
>>>>> 
>>>>> With patch, it is compiled to:
>>>>> foo:
>>>>> ldr     q31, [x0]
>>>>> ext     v30.16b, v31.16b, v31.16b, #8
>>>>> and     v31.8b, v30.8b, v31.8b
>>>>> fmov    x0, d31
>>>>> and     x0, x0, x0, lsr 32
>>>>> and     w0, w0, w0, lsr 16
>>>>> ret
>>>>> 
>>>>> For modes V4SI and V2DI, the pattern was not implemented, because the
>>>>> current codegen (using only base instructions) is already efficient.
>>>>> 
>>>>> Note that the PR initially suggested to use SVE reduction ops. However,
>>>>> they have higher latency than the proposed sequence, which is why using
>>>>> neon and base instructions is preferable.
>>>>> 
>>>>> Test cases were added for 8/16-bit integers for all implemented modes and all
>>>>> three operations to check the produced assembly.
>>>>> 
>>>>> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
>>>>> because for aarch64 vector types, either the logical reduction optabs are
>>>>> implemented or the codegen for reduction operations is good as it is.
>>>>> This was motivated by failure of a scan-tree-dump directive in the test cases
>>>>> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>>>>> 
>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>>> regression.
>>>>> OK for mainline?
>>>>> 
>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>>> 
>>>>> gcc/
>>>>> PR target/113816
>>>>> * config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>>>>> Implement for logical bitwise operations for VDQV_E.
>>>>> 
>>>>> gcc/testsuite/
>>>>> PR target/113816
>>>>> * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>>>>> * gcc.target/aarch64/simd/logical_reduc.c: New test.
>>>>> * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
>>>>> ---
>>>>> gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>>>>> .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>>>>> .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>>>>> gcc/testsuite/lib/target-supports.exp         |   4 +-
>>>>> 4 files changed, 267 insertions(+), 2 deletions(-)
>>>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>>>>> 
>>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>> b/gcc/config/aarch64/aarch64-simd.md
>>>>> index 23c03a96371..00286b8b020 100644
>>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>>> @@ -3608,6 +3608,61 @@
>>>>>   }
>>>>> )
>>>>> 
>>>>> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
>>>>> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
>>>>> +;; on the two halves of the input.
>>>>> +;; If the input has 128 bits, the first operation is performed in vector
>>>>> +;; registers.  From 64 bits down, the reduction steps are performed in general
>>>>> +;; purpose registers.
>>>>> +;; For example, for V8HI and operation AND, the intended sequence is:
>>>>> +;; EXT      v1.16b, v0.16b, v0.16b, #8
>>>>> +;; AND      v0.8b, v1.8b, v0.8b
>>>>> +;; FMOV     x0, d0
>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>> +;;
>>>>> +;; For V8QI and operation AND, the sequence is:
>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>> +;; AND      w0, w0, w0, lsr 8
>>>>> +
>>>>> +(define_expand "reduc_<optab>_scal_<mode>"
>>>>> + [(match_operand:<VEL> 0 "register_operand")
>>>>> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
>>>>> +  "TARGET_SIMD"
>>>>> +  {
>>>>> +    rtx dst = operands[1];
>>>>> +    rtx tdi = gen_reg_rtx (DImode);
>>>>> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
>>>>> +    rtx op1_lo;
>>>>> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
>>>>> +      {
>>>>> +        rtx t0 = gen_reg_rtx (<MODE>mode);
>>>>> +        rtx t1 = gen_reg_rtx (DImode);
>>>>> +        rtx t2 = gen_reg_rtx (DImode);
>>>>> +        rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
>>>>> +        emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
>>>>> +        op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
>>>>> +        rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
>>>>> +        emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
>>>>> +        emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
>>>>> +        emit_insn (gen_<optab>di3 (t1, t1, t2));
>>>>> +        emit_move_insn (tdi, t1);
>>>>> +      }
>>>>> +    else
>>>>> +      {
>>>>> +        op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
>>>>> +        emit_move_insn (tdi, op1_lo);
>>>>> +      }
>>>>> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
>>>>> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
>>>>> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
>>>>> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
>>>>> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
>>>>> +    DONE;
>>>>> +  }
>>>>> +)
>>>> 
>>>> Sorry to be awkward, but I've got mixed feelings about this.
>>>> The sequence produced looks good.  But we're not defining this
>>>> optab because the target has a special instruction for implementing
>>>> the operation.  We're just implementing it to tweak the loop emitted
>>>> by vect_create_epilog_for_reduction.  Maybe we should change that loop
>>>> instead, perhaps with a target hook?
>>> 
>>> So does it currently not work because there's no v8qi, v4qi or v2qi
>>> vector modes?  I'll note there already _is_
>>> targetm.vectorize.split_reduction (mode), but it expects a vector
>>> mode as result and it doesn't get the operation in.
>>> 
>> 
>> The point of the reduction is that they shouldn't be done on the vector side.
>> So even if we have v4qi and v2qi the reduction shouldn't be done there as
>> the latency and throughput for these sub 64-bit operations on the vector
>> side is much higher and so should be done on the integer side. i.e. these
>> should be done on DI and SI.
>> 
>> This is why I disagree with Richard that this can be done generically for all
>> targets. The decomposition takes advantage of Arm's fused shifted logical
>> operations on GPR.  This makes them really cheap to do on integer.
> 
> Wouldn't it still be a win without that though?  Having the fused operation
> gives us a ~4x improvement in latency on a typical core (EXT + vector AND
> -> fused scalar AND).  But the unfused operation would still be a ~2x
> improvement.
> 
>> Richard brought up AArch32 as another example. I don't agree there either
>> Because AArch32's register file overlap means you don't need shifts.
>> The first step reduction can just be done by accessing the registers
>> as even/odd pairs on a smaller mode.
>> 
>> To get this codegen generically the targets must supports modes tie-able
>> between the vector and smaller vector size. i.e.  BIT_FIELD_REF on the
>> vector mode to a smaller mode must be cheap or free.
>> 
>> The decision for when you do the transfer from SIMD to GPR is highly
>> target specific, and the how as well.
>> 
>>> I would expect that we can improve on this existing machinery
>>> instead of inventing a new one.
>>> 
>> 
>> But we're not. We're just implementing the optab to the machinery
>> that already exists.
> 
> I don't think this kind of expansion is what optabs are there for though.
> I suppose it was in the pre-gimple days, when all optimisation was done
> on RTL.  (Although of course that also predates vectorisation.)  But now,
> I think the purpose of optabs is to advertise what the target can do via
> some form of direct h/w support.
> 
> The original patch (using ANDV) was an ideal use of optabs.  But I think
> the request to use a tweaked version of the current open-coded reduction
> scheme, although a good suggestion, also changes the way that we should
> go about it.
> 
>> I think we're just over engineering a new contributor's patch. Even if
>> we think there is some merit to do it generically, I don't see why we
>> shouldn't take this patch today.
> 
> Sorry if it comes across that way.
Hi everyone,
Thanks for the discussion around my patch. I will certainly be happy to look at alternative places to implement the optimization, including at the gimple level.
Could you offer some more guidance on which direction to pursue: extending targetm.vectorize.split_reduction (mode) or changing vect_create_epilog_for_reduction? I’m not familiar enough with these parts of the code yet to decide which one is more suitable.
Thanks and have a nice weekend,
Jennifer
> 
> Thanks,
> Richard
Richard Biener Oct. 11, 2024, 2:57 p.m. UTC | #11
> Am 11.10.2024 um 16:43 schrieb Jennifer Schmitz <jschmitz@nvidia.com>:
> 
> 
> 
>> On 11 Oct 2024, at 12:08, Richard Sandiford <richard.sandiford@arm.com> wrote:
>> 
>> External email: Use caution opening links or attachments
>> 
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>>>> -----Original Message-----
>>>> From: Richard Biener <rguenther@suse.de>
>>>> Sent: Friday, October 11, 2024 7:52 AM
>>>> To: Richard Sandiford <Richard.Sandiford@arm.com>
>>>> Cc: Jennifer Schmitz <jschmitz@nvidia.com>; gcc-patches@gcc.gnu.org; Richard
>>>> Earnshaw <Richard.Earnshaw@arm.com>; Kyrylo Tkachov
>>>> <ktkachov@nvidia.com>; Tamar Christina <Tamar.Christina@arm.com>
>>>> Subject: Re: [PATCH][PR113816] AArch64: Use SIMD+GPR for logical vector
>>>> reductions
>>>> 
>>>>> On Thu, 10 Oct 2024, Richard Sandiford wrote:
>>>>> 
>>>>>> Jennifer Schmitz <jschmitz@nvidia.com> writes:
>>>>>>> This patch implements the optabs reduc_and_scal_<mode>,
>>>>>>> reduc_ior_scal_<mode>, and reduc_xor_scal_<mode> for ASIMD modes V8QI,
>>>>>>> V16QI, V4HI, and V8HI for TARGET_SIMD to improve codegen for bitwise
>>>>> logical
>>>>>>> vector reduction operations.
>>>>>>> Previously, either only vector registers or only general purpose registers (GPR)
>>>>>>> were used. Now, vector registers are used for the reduction from 128 to 64
>>>>> bits;
>>>>>>> 64-bit GPR are used for the reduction from 64 to 32 bits; and 32-bit GPR are
>>>>> used
>>>>>>> for the rest of the reduction steps.
>>>>>>> 
>>>>>>> For example, the test case (V8HI)
>>>>>>> int16_t foo (int16_t *a)
>>>>>>> {
>>>>>>> int16_t b = -1;
>>>>>>> for (int i = 0; i < 8; ++i)
>>>>>>>   b &= a[i];
>>>>>>> return b;
>>>>>>> }
>>>>>>> 
>>>>>>> was previously compiled to (-O2):
>>>>>>> foo:
>>>>>>> ldr     q0, [x0]
>>>>>>> movi    v30.4s, 0
>>>>>>> ext     v29.16b, v0.16b, v30.16b, #8
>>>>>>> and     v29.16b, v29.16b, v0.16b
>>>>>>> ext     v31.16b, v29.16b, v30.16b, #4
>>>>>>> and     v31.16b, v31.16b, v29.16b
>>>>>>> ext     v30.16b, v31.16b, v30.16b, #2
>>>>>>> and     v30.16b, v30.16b, v31.16b
>>>>>>> umov    w0, v30.h[0]
>>>>>>> ret
>>>>>>> 
>>>>>>> With patch, it is compiled to:
>>>>>>> foo:
>>>>>>> ldr     q31, [x0]
>>>>>>> ext     v30.16b, v31.16b, v31.16b, #8
>>>>>>> and     v31.8b, v30.8b, v31.8b
>>>>>>> fmov    x0, d31
>>>>>>> and     x0, x0, x0, lsr 32
>>>>>>> and     w0, w0, w0, lsr 16
>>>>>>> ret
>>>>>>> 
>>>>>>> For modes V4SI and V2DI, the pattern was not implemented, because the
>>>>>>> current codegen (using only base instructions) is already efficient.
>>>>>>> 
>>>>>>> Note that the PR initially suggested to use SVE reduction ops. However,
>>>>>>> they have higher latency than the proposed sequence, which is why using
>>>>>>> neon and base instructions is preferable.
>>>>>>> 
>>>>>>> Test cases were added for 8/16-bit integers for all implemented modes and all
>>>>>>> three operations to check the produced assembly.
>>>>>>> 
>>>>>>> We also added [istarget aarch64*-*-*] to the selector vect_logical_reduc,
>>>>>>> because for aarch64 vector types, either the logical reduction optabs are
>>>>>>> implemented or the codegen for reduction operations is good as it is.
>>>>>>> This was motivated by failure of a scan-tree-dump directive in the test cases
>>>>>>> gcc.dg/vect/vect-reduc-or_1.c and gcc.dg/vect/vect-reduc-or_2.c.
>>>>>>> 
>>>>>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>>>>> regression.
>>>>>>> OK for mainline?
>>>>>>> 
>>>>>>> Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>
>>>>>>> 
>>>>>>> gcc/
>>>>>>> PR target/113816
>>>>>>> * config/aarch64/aarch64-simd.md (reduc_<optab>_scal_<mode>):
>>>>>>> Implement for logical bitwise operations for VDQV_E.
>>>>>>> 
>>>>>>> gcc/testsuite/
>>>>>>> PR target/113816
>>>>>>> * lib/target-supports.exp (vect_logical_reduc): Add aarch64*.
>>>>>>> * gcc.target/aarch64/simd/logical_reduc.c: New test.
>>>>>>> * gcc.target/aarch64/vect-reduc-or_1.c: Adjust expected outcome.
>>>>>>> ---
>>>>>>> gcc/config/aarch64/aarch64-simd.md            |  55 +++++
>>>>>>> .../gcc.target/aarch64/simd/logical_reduc.c   | 208 ++++++++++++++++++
>>>>>>> .../gcc.target/aarch64/vect-reduc-or_1.c      |   2 +-
>>>>>>> gcc/testsuite/lib/target-supports.exp         |   4 +-
>>>>>>> 4 files changed, 267 insertions(+), 2 deletions(-)
>>>>>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
>>>>>>> 
>>>>>>> diff --git a/gcc/config/aarch64/aarch64-simd.md
>>>>> b/gcc/config/aarch64/aarch64-simd.md
>>>>>>> index 23c03a96371..00286b8b020 100644
>>>>>>> --- a/gcc/config/aarch64/aarch64-simd.md
>>>>>>> +++ b/gcc/config/aarch64/aarch64-simd.md
>>>>>>> @@ -3608,6 +3608,61 @@
>>>>>>>  }
>>>>>>> )
>>>>>>> 
>>>>>>> +;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
>>>>>>> +;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
>>>>>>> +;; on the two halves of the input.
>>>>>>> +;; If the input has 128 bits, the first operation is performed in vector
>>>>>>> +;; registers.  From 64 bits down, the reduction steps are performed in general
>>>>>>> +;; purpose registers.
>>>>>>> +;; For example, for V8HI and operation AND, the intended sequence is:
>>>>>>> +;; EXT      v1.16b, v0.16b, v0.16b, #8
>>>>>>> +;; AND      v0.8b, v1.8b, v0.8b
>>>>>>> +;; FMOV     x0, d0
>>>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>>>> +;;
>>>>>>> +;; For V8QI and operation AND, the sequence is:
>>>>>>> +;; AND      x0, x0, x0, lsr 32
>>>>>>> +;; AND      w0, w0, w0, lsr 16
>>>>>>> +;; AND      w0, w0, w0, lsr 8
>>>>>>> +
>>>>>>> +(define_expand "reduc_<optab>_scal_<mode>"
>>>>>>> + [(match_operand:<VEL> 0 "register_operand")
>>>>>>> +  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
>>>>>>> +  "TARGET_SIMD"
>>>>>>> +  {
>>>>>>> +    rtx dst = operands[1];
>>>>>>> +    rtx tdi = gen_reg_rtx (DImode);
>>>>>>> +    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
>>>>>>> +    rtx op1_lo;
>>>>>>> +    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
>>>>>>> +      {
>>>>>>> +        rtx t0 = gen_reg_rtx (<MODE>mode);
>>>>>>> +        rtx t1 = gen_reg_rtx (DImode);
>>>>>>> +        rtx t2 = gen_reg_rtx (DImode);
>>>>>>> +        rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
>>>>>>> +        emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
>>>>>>> +        op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
>>>>>>> +        rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
>>>>>>> +        emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
>>>>>>> +        emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
>>>>>>> +        emit_insn (gen_<optab>di3 (t1, t1, t2));
>>>>>>> +        emit_move_insn (tdi, t1);
>>>>>>> +      }
>>>>>>> +    else
>>>>>>> +      {
>>>>>>> +        op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
>>>>>>> +        emit_move_insn (tdi, op1_lo);
>>>>>>> +      }
>>>>>>> +    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
>>>>>>> +    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
>>>>>>> +    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
>>>>>>> +      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
>>>>>>> +    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
>>>>>>> +    DONE;
>>>>>>> +  }
>>>>>>> +)
>>>>>> 
>>>>>> Sorry to be awkward, but I've got mixed feelings about this.
>>>>>> The sequence produced looks good.  But we're not defining this
>>>>>> optab because the target has a special instruction for implementing
>>>>>> the operation.  We're just implementing it to tweak the loop emitted
>>>>>> by vect_create_epilog_for_reduction.  Maybe we should change that loop
>>>>>> instead, perhaps with a target hook?
>>>>> 
>>>>> So does it currently not work because there's no v8qi, v4qi or v2qi
>>>>> vector modes?  I'll note there already _is_
>>>>> targetm.vectorize.split_reduction (mode), but it expects a vector
>>>>> mode as result and it doesn't get the operation in.
>>>>> 
>>> 
>>> The point of the reduction is that they shouldn't be done on the vector side.
>>> So even if we have v4qi and v2qi the reduction shouldn't be done there as
>>> the latency and throughput for these sub 64-bit operations on the vector
>>> side is much higher and so should be done on the integer side. i.e. these
>>> should be done on DI and SI.
>>> 
>>> This is why I disagree with Richard that this can be done generically for all
>>> targets. The decomposition takes advantage of Arm's fused shifted logical
>>> operations on GPR.  This makes them really cheap to do on integer.
>> 
>> Wouldn't it still be a win without that though?  Having the fused operation
>> gives us a ~4x improvement in latency on a typical core (EXT + vector AND
>> -> fused scalar AND).  But the unfused operation would still be a ~2x
>> improvement.
>> 
>>> Richard brought up AArch32 as another example. I don't agree there either
>>> Because AArch32's register file overlap means you don't need shifts.
>>> The first step reduction can just be done by accessing the registers
>>> as even/odd pairs on a smaller mode.
>>> 
>>> To get this codegen generically the targets must supports modes tie-able
>>> between the vector and smaller vector size. i.e.  BIT_FIELD_REF on the
>>> vector mode to a smaller mode must be cheap or free.
>>> 
>>> The decision for when you do the transfer from SIMD to GPR is highly
>>> target specific, and the how as well.
>>> 
>>>> I would expect that we can improve on this existing machinery
>>>> instead of inventing a new one.
>>>> 
>>> 
>>> But we're not. We're just implementing the optab to the machinery
>>> that already exists.
>> 
>> I don't think this kind of expansion is what optabs are there for though.
>> I suppose it was in the pre-gimple days, when all optimisation was done
>> on RTL.  (Although of course that also predates vectorisation.)  But now,
>> I think the purpose of optabs is to advertise what the target can do via
>> some form of direct h/w support.
>> 
>> The original patch (using ANDV) was an ideal use of optabs.  But I think
>> the request to use a tweaked version of the current open-coded reduction
>> scheme, although a good suggestion, also changes the way that we should
>> go about it.
>> 
>>> I think we're just over engineering a new contributor's patch. Even if
>>> we think there is some merit to do it generically, I don't see why we
>>> shouldn't take this patch today.
>> 
>> Sorry if it comes across that way.
> Hi everyone,
> Thanks for the discussion around my patch. I will certainly be happy to look at alternative implementation locations for the optimization, also on the gimple level.
> Could you offer some more guidance which direction to pursue, whether to extend targetm.vectorize.split_reduction (mode) or to make changes to vect_create_epilog_for_reduction? I’m not familiar enough with these parts of the code yet to decide which one is more suitable.

You need to touch both, making the hook suitable and producing scalar modes.

Richard 

> Thanks and have a nice weekend,
> Jennifer
>> 
>> Thanks,
>> Richard
> 
>
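
As a rough illustration of the direction suggested above (teaching targetm.vectorize.split_reduction about the operation and letting it return a scalar mode), here is a hypothetical sketch. The two-argument signature and the policy below are assumptions for illustration only; the existing hook takes just a machine_mode and returns a vector mode, and the snippet presumes GCC's internal headers:

/* Hypothetical sketch -- NOT an existing GCC interface.  The idea is to
   let the hook see the reduction code and return a scalar mode, so that
   vect_create_epilog_for_reduction can finish the reduction in GPRs.  */
static machine_mode
aarch64_split_reduction_sketch (machine_mode mode, tree_code code)
{
  /* For bitwise logical reductions of 64-bit or smaller vectors, continue
     in DImode so the remaining steps become shifted-logical GPR
     instructions (illustrative policy only).  */
  if ((code == BIT_AND_EXPR || code == BIT_IOR_EXPR || code == BIT_XOR_EXPR)
      && known_le (GET_MODE_SIZE (mode), 8))
    return DImode;

  /* Otherwise keep the current behaviour of splitting to a narrower
     vector mode (details omitted in this sketch).  */
  return mode;
}

Whether this belongs in the hook or in vect_create_epilog_for_reduction itself is exactly the open question in the thread.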

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 23c03a96371..00286b8b020 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3608,6 +3608,61 @@ 
   }
 )
 
+;; Emit a sequence for bitwise logical reductions over vectors for V8QI, V16QI,
+;; V4HI, and V8HI modes.  The reduction is achieved by iteratively operating
+;; on the two halves of the input.
+;; If the input has 128 bits, the first operation is performed in vector
+;; registers.  From 64 bits down, the reduction steps are performed in general
+;; purpose registers.
+;; For example, for V8HI and operation AND, the intended sequence is:
+;; EXT      v1.16b, v0.16b, v0.16b, #8
+;; AND      v0.8b, v1.8b, v0.8b
+;; FMOV     x0, d0
+;; AND      x0, x0, x0, lsr 32
+;; AND      w0, w0, w0, lsr 16
+;;
+;; For V8QI and operation AND, the sequence is:
+;; AND      x0, x0, x0, lsr 32
+;; AND      w0, w0, w0, lsr 16
+;; AND      w0, w0, w0, lsr 8
+
+(define_expand "reduc_<optab>_scal_<mode>"
+ [(match_operand:<VEL> 0 "register_operand")
+  (LOGICAL:VDQV_E (match_operand:VDQV_E 1 "register_operand"))]
+  "TARGET_SIMD"
+  {
+    rtx dst = operands[1];
+    rtx tdi = gen_reg_rtx (DImode);
+    rtx tsi = lowpart_subreg (SImode, tdi, DImode);
+    rtx op1_lo;
+    if (known_eq (GET_MODE_SIZE (<MODE>mode), 16))
+      {
+	rtx t0 = gen_reg_rtx (<MODE>mode);
+	rtx t1 = gen_reg_rtx (DImode);
+	rtx t2 = gen_reg_rtx (DImode);
+	rtx idx = GEN_INT (8 / GET_MODE_UNIT_SIZE (<MODE>mode));
+	emit_insn (gen_aarch64_ext<mode> (t0, dst, dst, idx));
+	op1_lo = lowpart_subreg (V2DImode, dst, <MODE>mode);
+	rtx t0_lo = lowpart_subreg (V2DImode, t0, <MODE>mode);
+	emit_insn (gen_aarch64_get_lanev2di (t1, op1_lo, GEN_INT (0)));
+	emit_insn (gen_aarch64_get_lanev2di (t2, t0_lo, GEN_INT (0)));
+	emit_insn (gen_<optab>di3 (t1, t1, t2));
+	emit_move_insn (tdi, t1);
+      }
+    else
+      {
+	op1_lo = lowpart_subreg (DImode, dst, <MODE>mode);
+	emit_move_insn (tdi, op1_lo);
+      }
+    emit_insn (gen_<optab>_lshrdi3 (tdi, tdi, GEN_INT (32), tdi));
+    emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (16), tsi));
+    if (known_eq (GET_MODE_UNIT_BITSIZE (<MODE>mode), 8))
+      emit_insn (gen_<optab>_lshrsi3 (tsi, tsi, GEN_INT (8), tsi));
+    emit_move_insn (operands[0], lowpart_subreg (<VEL>mode, tsi, SImode));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_reduc_<optab>_internal<mode>"
  [(set (match_operand:VDQV_S 0 "register_operand" "=w")
        (unspec:VDQV_S [(match_operand:VDQV_S 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
new file mode 100644
index 00000000000..9508288b218
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/logical_reduc.c
@@ -0,0 +1,208 @@ 
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint.h>
+
+/*
+** fv16qi_and:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	and	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	and	x0, x0, x0, lsr 32
+**	and	w0, w0, w0, lsr 16
+**	and	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv16qi_and (int8_t *a)
+{
+  int8_t b = -1;
+  for (int i = 0; i < 16; ++i)
+    b &= a[i];
+  return b;
+}
+
+/*
+** fv8hi_and:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	and	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	and	x0, x0, x0, lsr 32
+**	and	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv8hi_and (int16_t *a)
+{
+  int16_t b = -1;
+  for (int i = 0; i < 8; ++i)
+    b &= a[i];
+  return b;
+}
+
+/*
+** fv16qi_or:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	orr	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	orr	x0, x0, x0, lsr 32
+**	orr	w0, w0, w0, lsr 16
+**	orr	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv16qi_or (int8_t *a)
+{
+  int8_t b = 0;
+  for (int i = 0; i < 16; ++i)
+    b |= a[i];
+  return b;
+}
+
+/*
+** fv8hi_or:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	orr	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	orr	x0, x0, x0, lsr 32
+**	orr	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv8hi_or (int16_t *a)
+{
+  int16_t b = 0;
+  for (int i = 0; i < 8; ++i)
+    b |= a[i];
+  return b;
+}
+
+/*
+** fv16qi_xor:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	eor	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	eor	x0, x0, x0, lsr 32
+**	eor	w0, w0, w0, lsr 16
+**	eor	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv16qi_xor (int8_t *a)
+{
+  int8_t b = 0;
+  for (int i = 0; i < 16; ++i)
+    b ^= a[i];
+  return b;
+}
+
+/*
+** fv8hi_xor:
+**	ldr	q([0-9]+), \[x0\]
+**	ext	v([0-9]+)\.16b, v\1\.16b, v\1\.16b, #8
+**	eor	v\1\.8b, v\2\.8b, v\1\.8b
+**	fmov	x0, d\1
+**	eor	x0, x0, x0, lsr 32
+**	eor	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv8hi_xor (int16_t *a)
+{
+  int16_t b = 0;
+  for (int i = 0; i < 8; ++i)
+    b ^= a[i];
+  return b;
+}
+
+/*
+** fv8qi_and:
+**	ldr	x0, \[x0\]
+**	and	x0, x0, x0, lsr 32
+**	and	w0, w0, w0, lsr 16
+**	and	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv8qi_and (int8_t *a)
+{
+  int8_t b = -1;
+  for (int i = 0; i < 8; ++i)
+    b &= a[i];
+  return b;
+}
+
+/*
+** fv4hi_and:
+**	ldr	x0, \[x0\]
+**	and	x0, x0, x0, lsr 32
+**	and	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv4hi_and (int16_t *a)
+{
+  int16_t b = -1;
+  for (int i = 0; i < 4; ++i)
+    b &= a[i];
+  return b;
+}
+
+/*
+** fv8qi_or:
+**	ldr	x0, \[x0\]
+**	orr	x0, x0, x0, lsr 32
+**	orr	w0, w0, w0, lsr 16
+**	orr	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv8qi_or (int8_t *a)
+{
+  int8_t b = 0;
+  for (int i = 0; i < 8; ++i)
+    b |= a[i];
+  return b;
+}
+
+/*
+** fv4hi_or:
+**	ldr	x0, \[x0\]
+**	orr	x0, x0, x0, lsr 32
+**	orr	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv4hi_or (int16_t *a)
+{
+  int16_t b = 0;
+  for (int i = 0; i < 4; ++i)
+    b |= a[i];
+  return b;
+}
+
+/*
+** fv8qi_xor:
+**	ldr	x0, \[x0\]
+**	eor	x0, x0, x0, lsr 32
+**	eor	w0, w0, w0, lsr 16
+**	eor	w0, w0, w0, lsr 8
+**	ret
+*/
+int8_t fv8qi_xor (int8_t *a)
+{
+  int8_t b = 0;
+  for (int i = 0; i < 8; ++i)
+    b ^= a[i];
+  return b;
+}
+
+/*
+** fv4hi_xor:
+**	ldr	x0, \[x0\]
+**	eor	x0, x0, x0, lsr 32
+**	eor	w0, w0, w0, lsr 16
+**	ret
+*/
+int16_t fv4hi_xor (int16_t *a)
+{
+  int16_t b = 0;
+  for (int i = 0; i < 4; ++i)
+    b ^= a[i];
+  return b;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
index 918822a7d00..70c4ca18094 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-reduc-or_1.c
@@ -32,4 +32,4 @@  main (unsigned char argc, char **argv)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump "Reduce using vector shifts" "vect" } } */
+/* { dg-final { scan-tree-dump "Reduce using direct vector reduction" "vect" } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 8f2afe866c7..44f737f15d0 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -9564,7 +9564,9 @@  proc check_effective_target_vect_logical_reduc { } {
 		   || [istarget amdgcn-*-*]
 		   || [check_effective_target_riscv_v]
 		   || [check_effective_target_loongarch_sx]
-		   || [istarget i?86-*-*] || [istarget x86_64-*-*]}]
+		   || [istarget i?86-*-*]
+		   || [istarget x86_64-*-*]
+		   || [istarget aarch64*-*-*]}]
 }
 
 # Return 1 if the target supports the fold_extract_last optab.