diff mbox series

[v8,2/2] aarch64: Add codegen support for AdvSIMD faminmax

Message ID 20240903153259.3136111-3-saurabh.jha@arm.com
State New
Headers show
Series aarch64: Add support for AdvSIMD faminmax. | expand

Commit Message

Saurabh Jha Sept. 3, 2024, 3:32 p.m. UTC
The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and
mandatory from Armv9.5-a. It introduces instructions for computing the
floating point absolute maximum and minimum of the two vectors
element-wise.

This patch adds code generation support for famax and famin in terms of
existing RTL operators.

famax/famin is equivalent to first taking abs of the operands and then
taking smax/smin on the results of abs.

	famax/famin (a, b) = smax/smin (abs (a), abs (b))

This fusion of operators is only possible when -march=armv9-a+faminmax
flags are passed. We also need to pass -ffast-math flag; if we don't,
then a statement like

	c[i] = __builtin_fmaxf16 (a[i], b[i]);

is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin).

This code generation is only available on -O2 or -O3 as that is when
auto-vectorization is enabled.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md
	(*aarch64_faminmax_fused): Instruction pattern for faminmax
	codegen.
	* config/aarch64/iterators.md: Attribute for faminmax codegen.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test.
	* gcc.target/aarch64/simd/faminmax-codegen.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            |  10 +
 gcc/config/aarch64/iterators.md               |   3 +
 .../aarch64/simd/faminmax-codegen-no-flag.c   | 217 ++++++++++++++++++
 .../aarch64/simd/faminmax-codegen.c           | 197 ++++++++++++++++
 4 files changed, 427 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c

Comments

Richard Sandiford Sept. 17, 2024, 5:44 p.m. UTC | #1
<saurabh.jha@arm.com> writes:
> The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and
> mandatory from Armv9.5-a. It introduces instructions for computing the
> floating point absolute maximum and minimum of the two vectors
> element-wise.
>
> This patch adds code generation support for famax and famin in terms of
> existing RTL operators.
>
> famax/famin is equivalent to first taking abs of the operands and then
> taking smax/smin on the results of abs.
>
> 	famax/famin (a, b) = smax/smin (abs (a), abs (b))
>
> This fusion of operators is only possible when -march=armv9-a+faminmax
> flags are passed. We also need to pass -ffast-math flag; if we don't,
> then a statement like
>
> 	c[i] = __builtin_fmaxf16 (a[i], b[i]);
>
> is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin).
>
> This code generation is only available on -O2 or -O3 as that is when
> auto-vectorization is enabled.

The comment in 1/2 about not having a specific neon_fp_aminmax<q>
scheduling type would apply here too.  It looks good otherwise,
but: following on from the SVE review, could you also add tests
to make sure that we don't fold vmax/vmaxnm/vmin/vminm calls with
vabs calls?  I think the code handles it correctly, but it would
be worth having a test to "defend" the behavior.

Thanks,
Richard

>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-simd.md
> 	(*aarch64_faminmax_fused): Instruction pattern for faminmax
> 	codegen.
> 	* config/aarch64/iterators.md: Attribute for faminmax codegen.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test.
> 	* gcc.target/aarch64/simd/faminmax-codegen.c: New test.
> ---
>  gcc/config/aarch64/aarch64-simd.md            |  10 +
>  gcc/config/aarch64/iterators.md               |   3 +
>  .../aarch64/simd/faminmax-codegen-no-flag.c   | 217 ++++++++++++++++++
>  .../aarch64/simd/faminmax-codegen.c           | 197 ++++++++++++++++
>  4 files changed, 427 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 7542c81ed91..8973cade488 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -9921,3 +9921,13 @@
>    "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
>    [(set_attr "type" "neon_fp_aminmax<q>")]
>  )
> +
> +(define_insn "*aarch64_faminmax_fused"
> +  [(set (match_operand:VHSDF 0 "register_operand" "=w")
> +	(FMAXMIN:VHSDF
> +	  (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w"))
> +	  (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))]
> +  "TARGET_FAMINMAX"
> +  "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
> +  [(set_attr "type" "neon_fp_aminmax<q>")]
> +)
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 17ac5e073aa..c2fcd18306e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -4472,3 +4472,6 @@
>  
>  (define_int_attr faminmax_uns_op
>    [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
> +
> +(define_code_attr faminmax_op
> +  [(smax "famax") (smin "famin")])
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
> new file mode 100644
> index 00000000000..d77f5a5d19f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
> @@ -0,0 +1,217 @@
> +/* { dg-do assemble} */
> +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_neon.h"
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** test_vamax_f16:
> +**	fabs	v1.4h, v1.4h
> +**	fabs	v0.4h, v0.4h
> +**	fmaxnm	v0.4h, v0.4h, v1.4h
> +**	ret
> +*/
> +float16x4_t
> +test_vamax_f16 (float16x4_t a, float16x4_t b)
> +{
> +  int i;
> +  float16x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fmaxf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f16:
> +**	fabs	v1.8h, v1.8h
> +**	fabs	v0.8h, v0.8h
> +**	fmaxnm	v0.8h, v0.8h, v1.8h
> +**	ret
> +*/
> +float16x8_t
> +test_vamaxq_f16 (float16x8_t a, float16x8_t b)
> +{
> +  int i;
> +  float16x8_t c;
> +
> +  for (i = 0; i < 8; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fmaxf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamax_f32:
> +**	fabs	v1.2s, v1.2s
> +**	fabs	v0.2s, v0.2s
> +**	fmaxnm	v0.2s, v0.2s, v1.2s
> +**	ret
> +*/
> +float32x2_t
> +test_vamax_f32 (float32x2_t a, float32x2_t b)
> +{
> +  int i;
> +  float32x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fmaxf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f32:
> +**	fabs	v1.4s, v1.4s
> +**	fabs	v0.4s, v0.4s
> +**	fmaxnm	v0.4s, v0.4s, v1.4s
> +**	ret
> +*/
> +float32x4_t
> +test_vamaxq_f32 (float32x4_t a, float32x4_t b)
> +{
> +  int i;
> +  float32x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fmaxf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f64:
> +**	fabs	v1.2d, v1.2d
> +**	fabs	v0.2d, v0.2d
> +**	fmaxnm	v0.2d, v0.2d, v1.2d
> +**	ret
> +*/
> +float64x2_t
> +test_vamaxq_f64 (float64x2_t a, float64x2_t b)
> +{
> +  int i;
> +  float64x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf64 (a[i]);
> +    b[i] = __builtin_fabsf64 (b[i]);
> +    c[i] = __builtin_fmaxf64 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamin_f16:
> +**	fabs	v1.4h, v1.4h
> +**	fabs	v0.4h, v0.4h
> +**	fminnm	v0.4h, v0.4h, v1.4h
> +**	ret
> +*/
> +float16x4_t
> +test_vamin_f16 (float16x4_t a, float16x4_t b)
> +{
> +  int i;
> +  float16x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fminf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f16:
> +**	fabs	v1.8h, v1.8h
> +**	fabs	v0.8h, v0.8h
> +**	fminnm	v0.8h, v0.8h, v1.8h
> +**	ret
> +*/
> +float16x8_t
> +test_vaminq_f16 (float16x8_t a, float16x8_t b)
> +{
> +  int i;
> +  float16x8_t c;
> +
> +  for (i = 0; i < 8; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fminf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamin_f32:
> +**	fabs	v1.2s, v1.2s
> +**	fabs	v0.2s, v0.2s
> +**	fminnm	v0.2s, v0.2s, v1.2s
> +**	ret
> +*/
> +float32x2_t
> +test_vamin_f32 (float32x2_t a, float32x2_t b)
> +{
> +  int i;
> +  float32x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fminf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f32:
> +**	fabs	v1.4s, v1.4s
> +**	fabs	v0.4s, v0.4s
> +**	fminnm	v0.4s, v0.4s, v1.4s
> +**	ret
> +*/
> +float32x4_t
> +test_vaminq_f32 (float32x4_t a, float32x4_t b)
> +{
> +  int i;
> +  float32x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fminf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f64:
> +**	fabs	v1.2d, v1.2d
> +**	fabs	v0.2d, v0.2d
> +**	fminnm	v0.2d, v0.2d, v1.2d
> +**	ret
> +*/
> +float64x2_t
> +test_vaminq_f64 (float64x2_t a, float64x2_t b)
> +{
> +  int i;
> +  float64x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf64 (a[i]);
> +    b[i] = __builtin_fabsf64 (b[i]);
> +    c[i] = __builtin_fminf64 (a[i], b[i]);
> +  }
> +  return c;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c
> new file mode 100644
> index 00000000000..971386c0bf0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c
> @@ -0,0 +1,197 @@
> +/* { dg-do assemble} */
> +/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_neon.h"
> +
> +#pragma GCC target "+nosve"
> +
> +/*
> +** test_vamax_f16:
> +**	famax	v0.4h, v1.4h, v0.4h
> +**	ret
> +*/
> +float16x4_t
> +test_vamax_f16 (float16x4_t a, float16x4_t b)
> +{
> +  int i;
> +  float16x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fmaxf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f16:
> +**	famax	v0.8h, v1.8h, v0.8h
> +**	ret
> +*/
> +float16x8_t
> +test_vamaxq_f16 (float16x8_t a, float16x8_t b)
> +{
> +  int i;
> +  float16x8_t c;
> +
> +  for (i = 0; i < 8; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fmaxf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamax_f32:
> +**	famax	v0.2s, v1.2s, v0.2s
> +**	ret
> +*/
> +float32x2_t
> +test_vamax_f32 (float32x2_t a, float32x2_t b)
> +{
> +  int i;
> +  float32x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fmaxf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f32:
> +**	famax	v0.4s, v1.4s, v0.4s
> +**	ret
> +*/
> +float32x4_t
> +test_vamaxq_f32 (float32x4_t a, float32x4_t b)
> +{
> +  int i;
> +  float32x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fmaxf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamaxq_f64:
> +**	famax	v0.2d, v1.2d, v0.2d
> +**	ret
> +*/
> +float64x2_t
> +test_vamaxq_f64 (float64x2_t a, float64x2_t b)
> +{
> +  int i;
> +  float64x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf64 (a[i]);
> +    b[i] = __builtin_fabsf64 (b[i]);
> +    c[i] = __builtin_fmaxf64 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamin_f16:
> +**	famin	v0.4h, v1.4h, v0.4h
> +**	ret
> +*/
> +float16x4_t
> +test_vamin_f16 (float16x4_t a, float16x4_t b)
> +{
> +  int i;
> +  float16x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fminf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f16:
> +**	famin	v0.8h, v1.8h, v0.8h
> +**	ret
> +*/
> +float16x8_t
> +test_vaminq_f16 (float16x8_t a, float16x8_t b)
> +{
> +  int i;
> +  float16x8_t c;
> +
> +  for (i = 0; i < 8; ++i) {
> +    a[i] = __builtin_fabsf16 (a[i]);
> +    b[i] = __builtin_fabsf16 (b[i]);
> +    c[i] = __builtin_fminf16 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vamin_f32:
> +**	famin	v0.2s, v1.2s, v0.2s
> +**	ret
> +*/
> +float32x2_t
> +test_vamin_f32 (float32x2_t a, float32x2_t b)
> +{
> +  int i;
> +  float32x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fminf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f32:
> +**	famin	v0.4s, v1.4s, v0.4s
> +**	ret
> +*/
> +float32x4_t
> +test_vaminq_f32 (float32x4_t a, float32x4_t b)
> +{
> +  int i;
> +  float32x4_t c;
> +
> +  for (i = 0; i < 4; ++i) {
> +    a[i] = __builtin_fabsf32 (a[i]);
> +    b[i] = __builtin_fabsf32 (b[i]);
> +    c[i] = __builtin_fminf32 (a[i], b[i]);
> +  }
> +  return c;
> +}
> +
> +/*
> +** test_vaminq_f64:
> +**	famin	v0.2d, v1.2d, v0.2d
> +**	ret
> +*/
> +float64x2_t
> +test_vaminq_f64 (float64x2_t a, float64x2_t b)
> +{
> +  int i;
> +  float64x2_t c;
> +
> +  for (i = 0; i < 2; ++i) {
> +    a[i] = __builtin_fabsf64 (a[i]);
> +    b[i] = __builtin_fabsf64 (b[i]);
> +    c[i] = __builtin_fminf64 (a[i], b[i]);
> +  }
> +  return c;
> +}
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7542c81ed91..8973cade488 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9921,3 +9921,13 @@ 
   "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_fp_aminmax<q>")]
 )
+
+(define_insn "*aarch64_faminmax_fused"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(FMAXMIN:VHSDF
+	  (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w"))
+	  (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))]
+  "TARGET_FAMINMAX"
+  "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_fp_aminmax<q>")]
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 17ac5e073aa..c2fcd18306e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -4472,3 +4472,6 @@ 
 
 (define_int_attr faminmax_uns_op
   [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
+
+(define_code_attr faminmax_op
+  [(smax "famax") (smin "famin")])
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
new file mode 100644
index 00000000000..d77f5a5d19f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
@@ -0,0 +1,217 @@ 
+/* { dg-do assemble} */
+/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+#pragma GCC target "+nosve"
+
+/*
+** test_vamax_f16:
+**	fabs	v1.4h, v1.4h
+**	fabs	v0.4h, v0.4h
+**	fmaxnm	v0.4h, v0.4h, v1.4h
+**	ret
+*/
+float16x4_t
+test_vamax_f16 (float16x4_t a, float16x4_t b)
+{
+  int i;
+  float16x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f16:
+**	fabs	v1.8h, v1.8h
+**	fabs	v0.8h, v0.8h
+**	fmaxnm	v0.8h, v0.8h, v1.8h
+**	ret
+*/
+float16x8_t
+test_vamaxq_f16 (float16x8_t a, float16x8_t b)
+{
+  int i;
+  float16x8_t c;
+
+  for (i = 0; i < 8; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamax_f32:
+**	fabs	v1.2s, v1.2s
+**	fabs	v0.2s, v0.2s
+**	fmaxnm	v0.2s, v0.2s, v1.2s
+**	ret
+*/
+float32x2_t
+test_vamax_f32 (float32x2_t a, float32x2_t b)
+{
+  int i;
+  float32x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f32:
+**	fabs	v1.4s, v1.4s
+**	fabs	v0.4s, v0.4s
+**	fmaxnm	v0.4s, v0.4s, v1.4s
+**	ret
+*/
+float32x4_t
+test_vamaxq_f32 (float32x4_t a, float32x4_t b)
+{
+  int i;
+  float32x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f64:
+**	fabs	v1.2d, v1.2d
+**	fabs	v0.2d, v0.2d
+**	fmaxnm	v0.2d, v0.2d, v1.2d
+**	ret
+*/
+float64x2_t
+test_vamaxq_f64 (float64x2_t a, float64x2_t b)
+{
+  int i;
+  float64x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf64 (a[i]);
+    b[i] = __builtin_fabsf64 (b[i]);
+    c[i] = __builtin_fmaxf64 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamin_f16:
+**	fabs	v1.4h, v1.4h
+**	fabs	v0.4h, v0.4h
+**	fminnm	v0.4h, v0.4h, v1.4h
+**	ret
+*/
+float16x4_t
+test_vamin_f16 (float16x4_t a, float16x4_t b)
+{
+  int i;
+  float16x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fminf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f16:
+**	fabs	v1.8h, v1.8h
+**	fabs	v0.8h, v0.8h
+**	fminnm	v0.8h, v0.8h, v1.8h
+**	ret
+*/
+float16x8_t
+test_vaminq_f16 (float16x8_t a, float16x8_t b)
+{
+  int i;
+  float16x8_t c;
+
+  for (i = 0; i < 8; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fminf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamin_f32:
+**	fabs	v1.2s, v1.2s
+**	fabs	v0.2s, v0.2s
+**	fminnm	v0.2s, v0.2s, v1.2s
+**	ret
+*/
+float32x2_t
+test_vamin_f32 (float32x2_t a, float32x2_t b)
+{
+  int i;
+  float32x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fminf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f32:
+**	fabs	v1.4s, v1.4s
+**	fabs	v0.4s, v0.4s
+**	fminnm	v0.4s, v0.4s, v1.4s
+**	ret
+*/
+float32x4_t
+test_vaminq_f32 (float32x4_t a, float32x4_t b)
+{
+  int i;
+  float32x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fminf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f64:
+**	fabs	v1.2d, v1.2d
+**	fabs	v0.2d, v0.2d
+**	fminnm	v0.2d, v0.2d, v1.2d
+**	ret
+*/
+float64x2_t
+test_vaminq_f64 (float64x2_t a, float64x2_t b)
+{
+  int i;
+  float64x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf64 (a[i]);
+    b[i] = __builtin_fabsf64 (b[i]);
+    c[i] = __builtin_fminf64 (a[i], b[i]);
+  }
+  return c;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c
new file mode 100644
index 00000000000..971386c0bf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c
@@ -0,0 +1,197 @@ 
+/* { dg-do assemble} */
+/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+#pragma GCC target "+nosve"
+
+/*
+** test_vamax_f16:
+**	famax	v0.4h, v1.4h, v0.4h
+**	ret
+*/
+float16x4_t
+test_vamax_f16 (float16x4_t a, float16x4_t b)
+{
+  int i;
+  float16x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f16:
+**	famax	v0.8h, v1.8h, v0.8h
+**	ret
+*/
+float16x8_t
+test_vamaxq_f16 (float16x8_t a, float16x8_t b)
+{
+  int i;
+  float16x8_t c;
+
+  for (i = 0; i < 8; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamax_f32:
+**	famax	v0.2s, v1.2s, v0.2s
+**	ret
+*/
+float32x2_t
+test_vamax_f32 (float32x2_t a, float32x2_t b)
+{
+  int i;
+  float32x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f32:
+**	famax	v0.4s, v1.4s, v0.4s
+**	ret
+*/
+float32x4_t
+test_vamaxq_f32 (float32x4_t a, float32x4_t b)
+{
+  int i;
+  float32x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f64:
+**	famax	v0.2d, v1.2d, v0.2d
+**	ret
+*/
+float64x2_t
+test_vamaxq_f64 (float64x2_t a, float64x2_t b)
+{
+  int i;
+  float64x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf64 (a[i]);
+    b[i] = __builtin_fabsf64 (b[i]);
+    c[i] = __builtin_fmaxf64 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamin_f16:
+**	famin	v0.4h, v1.4h, v0.4h
+**	ret
+*/
+float16x4_t
+test_vamin_f16 (float16x4_t a, float16x4_t b)
+{
+  int i;
+  float16x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fminf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f16:
+**	famin	v0.8h, v1.8h, v0.8h
+**	ret
+*/
+float16x8_t
+test_vaminq_f16 (float16x8_t a, float16x8_t b)
+{
+  int i;
+  float16x8_t c;
+
+  for (i = 0; i < 8; ++i) {
+    a[i] = __builtin_fabsf16 (a[i]);
+    b[i] = __builtin_fabsf16 (b[i]);
+    c[i] = __builtin_fminf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamin_f32:
+**	famin	v0.2s, v1.2s, v0.2s
+**	ret
+*/
+float32x2_t
+test_vamin_f32 (float32x2_t a, float32x2_t b)
+{
+  int i;
+  float32x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fminf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f32:
+**	famin	v0.4s, v1.4s, v0.4s
+**	ret
+*/
+float32x4_t
+test_vaminq_f32 (float32x4_t a, float32x4_t b)
+{
+  int i;
+  float32x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+    a[i] = __builtin_fabsf32 (a[i]);
+    b[i] = __builtin_fabsf32 (b[i]);
+    c[i] = __builtin_fminf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vaminq_f64:
+**	famin	v0.2d, v1.2d, v0.2d
+**	ret
+*/
+float64x2_t
+test_vaminq_f64 (float64x2_t a, float64x2_t b)
+{
+  int i;
+  float64x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+    a[i] = __builtin_fabsf64 (a[i]);
+    b[i] = __builtin_fabsf64 (b[i]);
+    c[i] = __builtin_fminf64 (a[i], b[i]);
+  }
+  return c;
+}