Message ID | 20240903153259.3136111-3-saurabh.jha@arm.com |
---|---|
State | New |
Headers | show |
Series | aarch64: Add support for AdvSIMD faminmax. | expand |
<saurabh.jha@arm.com> writes: > The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and > mandatory from Armv9.5-a. It introduces instructions for computing the > floating point absolute maximum and minimum of the two vectors > element-wise. > > This patch adds code generation support for famax and famin in terms of > existing RTL operators. > > famax/famin is equivalent to first taking abs of the operands and then > taking smax/smin on the results of abs. > > famax/famin (a, b) = smax/smin (abs (a), abs (b)) > > This fusion of operators is only possible when -march=armv9-a+faminmax > flags are passed. We also need to pass -ffast-math flag; if we don't, > then a statement like > > c[i] = __builtin_fmaxf16 (a[i], b[i]); > > is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin). > > This code generation is only available on -O2 or -O3 as that is when > auto-vectorization is enabled. The comment in 1/2 about not having a specific neon_fp_aminmax<q> scheduling type would apply here too. It looks good otherwise, but: following on from the SVE review, could you also add tests to make sure that we don't fold vmax/vmaxnm/vmin/vminm calls with vabs calls? I think the code handles it correctly, but it would be worth having a test to "defend" the behavior. Thanks, Richard > > gcc/ChangeLog: > > * config/aarch64/aarch64-simd.md > (*aarch64_faminmax_fused): Instruction pattern for faminmax > codegen. > * config/aarch64/iterators.md: Attribute for faminmax codegen. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test. > * gcc.target/aarch64/simd/faminmax-codegen.c: New test. > --- > gcc/config/aarch64/aarch64-simd.md | 10 + > gcc/config/aarch64/iterators.md | 3 + > .../aarch64/simd/faminmax-codegen-no-flag.c | 217 ++++++++++++++++++ > .../aarch64/simd/faminmax-codegen.c | 197 ++++++++++++++++ > 4 files changed, 427 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c > > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md > index 7542c81ed91..8973cade488 100644 > --- a/gcc/config/aarch64/aarch64-simd.md > +++ b/gcc/config/aarch64/aarch64-simd.md > @@ -9921,3 +9921,13 @@ > "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" > [(set_attr "type" "neon_fp_aminmax<q>")] > ) > + > +(define_insn "*aarch64_faminmax_fused" > + [(set (match_operand:VHSDF 0 "register_operand" "=w") > + (FMAXMIN:VHSDF > + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) > + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] > + "TARGET_FAMINMAX" > + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" > + [(set_attr "type" "neon_fp_aminmax<q>")] > +) > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index 17ac5e073aa..c2fcd18306e 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -4472,3 +4472,6 @@ > > (define_int_attr faminmax_uns_op > [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) > + > +(define_code_attr faminmax_op > + [(smax "famax") (smin "famin")]) > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c > new file mode 100644 > index 00000000000..d77f5a5d19f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c > @@ -0,0 +1,217 @@ > +/* { dg-do assemble} */ > +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > + > +#include "arm_neon.h" > + > +#pragma GCC target "+nosve" > + > +/* > +** test_vamax_f16: > +** fabs v1.4h, v1.4h > +** fabs v0.4h, v0.4h > +** fmaxnm v0.4h, v0.4h, v1.4h > +** ret > +*/ > +float16x4_t > +test_vamax_f16 (float16x4_t a, float16x4_t b) > +{ > + int i; > + float16x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fmaxf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f16: > +** fabs v1.8h, v1.8h > +** fabs v0.8h, v0.8h > +** fmaxnm v0.8h, v0.8h, v1.8h > +** ret > +*/ > +float16x8_t > +test_vamaxq_f16 (float16x8_t a, float16x8_t b) > +{ > + int i; > + float16x8_t c; > + > + for (i = 0; i < 8; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fmaxf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamax_f32: > +** fabs v1.2s, v1.2s > +** fabs v0.2s, v0.2s > +** fmaxnm v0.2s, v0.2s, v1.2s > +** ret > +*/ > +float32x2_t > +test_vamax_f32 (float32x2_t a, float32x2_t b) > +{ > + int i; > + float32x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fmaxf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f32: > +** fabs v1.4s, v1.4s > +** fabs v0.4s, v0.4s > +** fmaxnm v0.4s, v0.4s, v1.4s > +** ret > +*/ > +float32x4_t > +test_vamaxq_f32 (float32x4_t a, float32x4_t b) > +{ > + int i; > + float32x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fmaxf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f64: > +** fabs v1.2d, v1.2d > +** fabs v0.2d, v0.2d > +** fmaxnm v0.2d, v0.2d, v1.2d > +** ret > +*/ > +float64x2_t > +test_vamaxq_f64 (float64x2_t a, float64x2_t b) > +{ > + int i; > + float64x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf64 (a[i]); > + b[i] = __builtin_fabsf64 (b[i]); > + c[i] = __builtin_fmaxf64 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamin_f16: > +** fabs v1.4h, v1.4h > +** fabs v0.4h, v0.4h > +** fminnm v0.4h, v0.4h, v1.4h > +** ret > +*/ > +float16x4_t > +test_vamin_f16 (float16x4_t a, float16x4_t b) > +{ > + int i; > + float16x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fminf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f16: > +** fabs v1.8h, v1.8h > +** fabs v0.8h, v0.8h > +** fminnm v0.8h, v0.8h, v1.8h > +** ret > +*/ > +float16x8_t > +test_vaminq_f16 (float16x8_t a, float16x8_t b) > +{ > + int i; > + float16x8_t c; > + > + for (i = 0; i < 8; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fminf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamin_f32: > +** fabs v1.2s, v1.2s > +** fabs v0.2s, v0.2s > +** fminnm v0.2s, v0.2s, v1.2s > +** ret > +*/ > +float32x2_t > +test_vamin_f32 (float32x2_t a, float32x2_t b) > +{ > + int i; > + float32x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fminf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f32: > +** fabs v1.4s, v1.4s > +** fabs v0.4s, v0.4s > +** fminnm v0.4s, v0.4s, v1.4s > +** ret > +*/ > +float32x4_t > +test_vaminq_f32 (float32x4_t a, float32x4_t b) > +{ > + int i; > + float32x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fminf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f64: > +** fabs v1.2d, v1.2d > +** fabs v0.2d, v0.2d > +** fminnm v0.2d, v0.2d, v1.2d > +** ret > +*/ > +float64x2_t > +test_vaminq_f64 (float64x2_t a, float64x2_t b) > +{ > + int i; > + float64x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf64 (a[i]); > + b[i] = __builtin_fabsf64 (b[i]); > + c[i] = __builtin_fminf64 (a[i], b[i]); > + } > + return c; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c > new file mode 100644 > index 00000000000..971386c0bf0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c > @@ -0,0 +1,197 @@ > +/* { dg-do assemble} */ > +/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */ > +/* { dg-final { check-function-bodies "**" "" } } */ > + > +#include "arm_neon.h" > + > +#pragma GCC target "+nosve" > + > +/* > +** test_vamax_f16: > +** famax v0.4h, v1.4h, v0.4h > +** ret > +*/ > +float16x4_t > +test_vamax_f16 (float16x4_t a, float16x4_t b) > +{ > + int i; > + float16x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fmaxf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f16: > +** famax v0.8h, v1.8h, v0.8h > +** ret > +*/ > +float16x8_t > +test_vamaxq_f16 (float16x8_t a, float16x8_t b) > +{ > + int i; > + float16x8_t c; > + > + for (i = 0; i < 8; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fmaxf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamax_f32: > +** famax v0.2s, v1.2s, v0.2s > +** ret > +*/ > +float32x2_t > +test_vamax_f32 (float32x2_t a, float32x2_t b) > +{ > + int i; > + float32x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fmaxf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f32: > +** famax v0.4s, v1.4s, v0.4s > +** ret > +*/ > +float32x4_t > +test_vamaxq_f32 (float32x4_t a, float32x4_t b) > +{ > + int i; > + float32x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fmaxf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamaxq_f64: > +** famax v0.2d, v1.2d, v0.2d > +** ret > +*/ > +float64x2_t > +test_vamaxq_f64 (float64x2_t a, float64x2_t b) > +{ > + int i; > + float64x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf64 (a[i]); > + b[i] = __builtin_fabsf64 (b[i]); > + c[i] = __builtin_fmaxf64 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamin_f16: > +** famin v0.4h, v1.4h, v0.4h > +** ret > +*/ > +float16x4_t > +test_vamin_f16 (float16x4_t a, float16x4_t b) > +{ > + int i; > + float16x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fminf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f16: > +** famin v0.8h, v1.8h, v0.8h > +** ret > +*/ > +float16x8_t > +test_vaminq_f16 (float16x8_t a, float16x8_t b) > +{ > + int i; > + float16x8_t c; > + > + for (i = 0; i < 8; ++i) { > + a[i] = __builtin_fabsf16 (a[i]); > + b[i] = __builtin_fabsf16 (b[i]); > + c[i] = __builtin_fminf16 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vamin_f32: > +** famin v0.2s, v1.2s, v0.2s > +** ret > +*/ > +float32x2_t > +test_vamin_f32 (float32x2_t a, float32x2_t b) > +{ > + int i; > + float32x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fminf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f32: > +** famin v0.4s, v1.4s, v0.4s > +** ret > +*/ > +float32x4_t > +test_vaminq_f32 (float32x4_t a, float32x4_t b) > +{ > + int i; > + float32x4_t c; > + > + for (i = 0; i < 4; ++i) { > + a[i] = __builtin_fabsf32 (a[i]); > + b[i] = __builtin_fabsf32 (b[i]); > + c[i] = __builtin_fminf32 (a[i], b[i]); > + } > + return c; > +} > + > +/* > +** test_vaminq_f64: > +** famin v0.2d, v1.2d, v0.2d > +** ret > +*/ > +float64x2_t > +test_vaminq_f64 (float64x2_t a, float64x2_t b) > +{ > + int i; > + float64x2_t c; > + > + for (i = 0; i < 2; ++i) { > + a[i] = __builtin_fabsf64 (a[i]); > + b[i] = __builtin_fabsf64 (b[i]); > + c[i] = __builtin_fminf64 (a[i], b[i]); > + } > + return c; > +}
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 7542c81ed91..8973cade488 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -9921,3 +9921,13 @@ "<faminmax_uns_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" [(set_attr "type" "neon_fp_aminmax<q>")] ) + +(define_insn "*aarch64_faminmax_fused" + [(set (match_operand:VHSDF 0 "register_operand" "=w") + (FMAXMIN:VHSDF + (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")) + (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))] + "TARGET_FAMINMAX" + "<faminmax_op>\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>" + [(set_attr "type" "neon_fp_aminmax<q>")] +) diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 17ac5e073aa..c2fcd18306e 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -4472,3 +4472,6 @@ (define_int_attr faminmax_uns_op [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")]) + +(define_code_attr faminmax_op + [(smax "famax") (smin "famin")]) diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c new file mode 100644 index 00000000000..d77f5a5d19f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c @@ -0,0 +1,217 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fmaxnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fmaxnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fmaxnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fmaxnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fmaxnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fmaxf64 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f16: +** fabs v1.4h, v1.4h +** fabs v0.4h, v0.4h +** fminnm v0.4h, v0.4h, v1.4h +** ret +*/ +float16x4_t +test_vamin_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f16: +** fabs v1.8h, v1.8h +** fabs v0.8h, v0.8h +** fminnm v0.8h, v0.8h, v1.8h +** ret +*/ +float16x8_t +test_vaminq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f32: +** fabs v1.2s, v1.2s +** fabs v0.2s, v0.2s +** fminnm v0.2s, v0.2s, v1.2s +** ret +*/ +float32x2_t +test_vamin_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f32: +** fabs v1.4s, v1.4s +** fabs v0.4s, v0.4s +** fminnm v0.4s, v0.4s, v1.4s +** ret +*/ +float32x4_t +test_vaminq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f64: +** fabs v1.2d, v1.2d +** fabs v0.2d, v0.2d +** fminnm v0.2d, v0.2d, v1.2d +** ret +*/ +float64x2_t +test_vaminq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fminf64 (a[i], b[i]); + } + return c; +} diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c new file mode 100644 index 00000000000..971386c0bf0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c @@ -0,0 +1,197 @@ +/* { dg-do assemble} */ +/* { dg-additional-options "-O2 -ffast-math -march=armv9-a+faminmax" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +#pragma GCC target "+nosve" + +/* +** test_vamax_f16: +** famax v0.4h, v1.4h, v0.4h +** ret +*/ +float16x4_t +test_vamax_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f16: +** famax v0.8h, v1.8h, v0.8h +** ret +*/ +float16x8_t +test_vamaxq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fmaxf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamax_f32: +** famax v0.2s, v1.2s, v0.2s +** ret +*/ +float32x2_t +test_vamax_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f32: +** famax v0.4s, v1.4s, v0.4s +** ret +*/ +float32x4_t +test_vamaxq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fmaxf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vamaxq_f64: +** famax v0.2d, v1.2d, v0.2d +** ret +*/ +float64x2_t +test_vamaxq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fmaxf64 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f16: +** famin v0.4h, v1.4h, v0.4h +** ret +*/ +float16x4_t +test_vamin_f16 (float16x4_t a, float16x4_t b) +{ + int i; + float16x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f16: +** famin v0.8h, v1.8h, v0.8h +** ret +*/ +float16x8_t +test_vaminq_f16 (float16x8_t a, float16x8_t b) +{ + int i; + float16x8_t c; + + for (i = 0; i < 8; ++i) { + a[i] = __builtin_fabsf16 (a[i]); + b[i] = __builtin_fabsf16 (b[i]); + c[i] = __builtin_fminf16 (a[i], b[i]); + } + return c; +} + +/* +** test_vamin_f32: +** famin v0.2s, v1.2s, v0.2s +** ret +*/ +float32x2_t +test_vamin_f32 (float32x2_t a, float32x2_t b) +{ + int i; + float32x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f32: +** famin v0.4s, v1.4s, v0.4s +** ret +*/ +float32x4_t +test_vaminq_f32 (float32x4_t a, float32x4_t b) +{ + int i; + float32x4_t c; + + for (i = 0; i < 4; ++i) { + a[i] = __builtin_fabsf32 (a[i]); + b[i] = __builtin_fabsf32 (b[i]); + c[i] = __builtin_fminf32 (a[i], b[i]); + } + return c; +} + +/* +** test_vaminq_f64: +** famin v0.2d, v1.2d, v0.2d +** ret +*/ +float64x2_t +test_vaminq_f64 (float64x2_t a, float64x2_t b) +{ + int i; + float64x2_t c; + + for (i = 0; i < 2; ++i) { + a[i] = __builtin_fabsf64 (a[i]); + b[i] = __builtin_fabsf64 (b[i]); + c[i] = __builtin_fminf64 (a[i], b[i]); + } + return c; +}