Message ID | 20240722093011.3115878-4-claudio.bantaloukas@arm.com |
---|---|
State | New |
Headers | show |
Series | aarch64: Add march flags for +fp8 arch extensions | expand |
Hi Claudio, > On 22 Jul 2024, at 11:30, Claudio Bantaloukas <claudio.bantaloukas@arm.com> wrote: > > External email: Use caution opening links or attachments > > > The ACLE declares several helper types and functions to > facilitate construction of `fpm` arguments. > > gcc/ChangeLog: > > * config/aarch64/arm_acle.h (fpm_t): New type representing fpmr values. > (enum __ARM_FPM_FORMAT): New enum representing valid fp8 formats. > (enum __ARM_FPM_OVERFLOW): New enum representing how some fp8 > calculations work. > (arm_fpm_init): New. > (arm_set_fpm_src1_format): Likewise. > (arm_set_fpm_src2_format): Likewise. > (arm_set_fpm_dst_format): Likewise. > (arm_set_fpm_overflow_cvt): Likewise. > (arm_set_fpm_overflow_mul): Likewise. > (arm_set_fpm_lscale): Likewise. > (arm_set_fpm_lscale2): Likewise. > (arm_set_fpm_nscale): Likewise. According to the FP8 pull request in ACLE: https://github.com/ARM-software/acle/pull/323/files These functions should all be in the implementation namespace, that is they should begin with __arm*. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/acle/fp8-helpers.c: New test of fpmr helper functions. > --- > gcc/config/aarch64/arm_acle.h | 37 +++++++++++++ > .../gcc.target/aarch64/acle/fp8-helpers.c | 52 +++++++++++++++++++ > 2 files changed, 89 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c > > diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h > index 2aa681090fa..4f0c600582a 100644 > --- a/gcc/config/aarch64/arm_acle.h > +++ b/gcc/config/aarch64/arm_acle.h > @@ -385,6 +385,43 @@ __rndrrs (uint64_t *__res) > > #pragma GCC pop_options > > +#ifdef __ARM_FEATURE_FP8 > + > +typedef uint64_t fpm_t; > + > +enum __ARM_FPM_FORMAT > +{ > + __ARM_FPM_E5M2, > + __ARM_FPM_E4M3, > +}; > + > +enum __ARM_FPM_OVERFLOW > +{ > + __ARM_FPM_INFNAN, > + __ARM_FPM_SATURATE, > +}; > + > +#define arm_fpm_init() (0) > + > +#define arm_set_fpm_src1_format(__fpm, __format) \ > + ((__fpm & ~(uint64_t)0x7) | (__format & (uint64_t)0x7)) > +#define arm_set_fpm_src2_format(__fpm, __format) \ > + ((__fpm & ~((uint64_t)0x7 << 3)) | ((__format & (uint64_t)0x7) << 3)) > +#define arm_set_fpm_dst_format(__fpm, __format) \ > + ((__fpm & ~((uint64_t)0x7 << 6)) | ((__format & (uint64_t)0x7) << 6)) > +#define arm_set_fpm_overflow_cvt(__fpm, __behaviour) \ > + ((__fpm & ~((uint64_t)0x1 << 15)) | ((__behaviour & (uint64_t)0x1) << 15)) > +#define arm_set_fpm_overflow_mul(__fpm, __behaviour) \ > + ((__fpm & ~((uint64_t)0x1 << 14)) | ((__behaviour & (uint64_t)0x1) << 14)) > +#define arm_set_fpm_lscale(__fpm, __scale) \ > + ((__fpm & ~((uint64_t)0x7f << 16)) | ((__scale & (uint64_t)0x7f) << 16)) > +#define arm_set_fpm_lscale2(__fpm, __scale) \ > + ((__fpm & ~((uint64_t)0x3f << 32)) | ((__scale & (uint64_t)0x3f) << 32)) > +#define arm_set_fpm_nscale(__fpm, __scale) \ > + ((__fpm & ~((uint64_t)0xff << 24)) | ((__scale & (uint64_t)0xff) << 24)) > + I’d feel more comfortable if these were implemented as proper intrinsics in aarch64-builtins.cc <http://aarch64-builtins.cc/> rather than open-coding them like that. For one it would guarantee tighter control over the code generation. This version would likely fluctuate a lot based on optimization level, evolution of unrelated passes in the mined and other things. Additionally, the ACLE mandates certain ranges for the scale parameter in these intrinsics. Having them as builtins would allow us to add validation code to help the user here. > +#endif > + > #ifdef __cplusplus > } > #endif > diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c > new file mode 100644 > index 00000000000..e654326c387 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c > @@ -0,0 +1,52 @@ > +/* Test the fp8 ACLE helper functions. */ > +/* { dg-do compile } */ > +/* { dg-options "-std=c90 -pedantic-errors -O1 -march=armv9.4-a+fp8" } */ > + > +#include <arm_acle.h> > + > +void > +test_prepare_fpmr_sysreg () > +{ > + > +#define _S_EQ(expr, expected) \ > + _Static_assert (expr == expected, #expr " == " #expected) > + > + _S_EQ (arm_fpm_init (), 0); > + > + /* Bits [2:0] */ > + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); > + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x1); > + > + /* Bits [5:3] */ > + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); > + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x8); > + > + /* Bits [8:6] */ > + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); > + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x40); > + > + /* Bit 14 */ > + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_INFNAN), 0); > + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_SATURATE), > + 0x4000); > + > + /* Bit 15 */ > + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_INFNAN), 0); > + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_SATURATE), > + 0x8000); > + > + /* Bits [22:16] */ > + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 0), 0); > + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 127), 0x7F0000); > + > + /* Bits [37:32] */ > + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 0), 0); > + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 63), 0x3F00000000); > + > + /* Bits [31:24] */ > + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 0), 0); > + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 127), 0x7F000000); > + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), -128), 0x80000000); > + Given my comment above, I’d expect to see some code generation tests as well to check that the FPM modification functions above generate tight code. Thanks, Kyrill > +#undef _S_EQ > +}
Kyrylo Tkachov <ktkachov@nvidia.com> writes: > Hi Claudio, > >> On 22 Jul 2024, at 11:30, Claudio Bantaloukas <claudio.bantaloukas@arm.com> wrote: >> >> External email: Use caution opening links or attachments >> >> >> The ACLE declares several helper types and functions to >> facilitate construction of `fpm` arguments. >> >> gcc/ChangeLog: >> >> * config/aarch64/arm_acle.h (fpm_t): New type representing fpmr values. >> (enum __ARM_FPM_FORMAT): New enum representing valid fp8 formats. >> (enum __ARM_FPM_OVERFLOW): New enum representing how some fp8 >> calculations work. >> (arm_fpm_init): New. >> (arm_set_fpm_src1_format): Likewise. >> (arm_set_fpm_src2_format): Likewise. >> (arm_set_fpm_dst_format): Likewise. >> (arm_set_fpm_overflow_cvt): Likewise. >> (arm_set_fpm_overflow_mul): Likewise. >> (arm_set_fpm_lscale): Likewise. >> (arm_set_fpm_lscale2): Likewise. >> (arm_set_fpm_nscale): Likewise. > > According to the FP8 pull request in ACLE: > https://github.com/ARM-software/acle/pull/323/files > These functions should all be in the implementation namespace, that is they should begin with __arm*. > >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/aarch64/acle/fp8-helpers.c: New test of fpmr helper functions. >> --- >> gcc/config/aarch64/arm_acle.h | 37 +++++++++++++ >> .../gcc.target/aarch64/acle/fp8-helpers.c | 52 +++++++++++++++++++ >> 2 files changed, 89 insertions(+) >> create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >> >> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h >> index 2aa681090fa..4f0c600582a 100644 >> --- a/gcc/config/aarch64/arm_acle.h >> +++ b/gcc/config/aarch64/arm_acle.h >> @@ -385,6 +385,43 @@ __rndrrs (uint64_t *__res) >> >> #pragma GCC pop_options >> >> +#ifdef __ARM_FEATURE_FP8 The definitions should be unconditional, so that they're still available if someone initially compiles without fp8 but enables it later in the translation unit. >> + >> +typedef uint64_t fpm_t; >> + >> +enum __ARM_FPM_FORMAT >> +{ >> + __ARM_FPM_E5M2, >> + __ARM_FPM_E4M3, >> +}; >> + >> +enum __ARM_FPM_OVERFLOW >> +{ >> + __ARM_FPM_INFNAN, >> + __ARM_FPM_SATURATE, >> +}; >> + >> +#define arm_fpm_init() (0) >> + >> +#define arm_set_fpm_src1_format(__fpm, __format) \ >> + ((__fpm & ~(uint64_t)0x7) | (__format & (uint64_t)0x7)) >> +#define arm_set_fpm_src2_format(__fpm, __format) \ >> + ((__fpm & ~((uint64_t)0x7 << 3)) | ((__format & (uint64_t)0x7) << 3)) >> +#define arm_set_fpm_dst_format(__fpm, __format) \ >> + ((__fpm & ~((uint64_t)0x7 << 6)) | ((__format & (uint64_t)0x7) << 6)) >> +#define arm_set_fpm_overflow_cvt(__fpm, __behaviour) \ >> + ((__fpm & ~((uint64_t)0x1 << 15)) | ((__behaviour & (uint64_t)0x1) << 15)) >> +#define arm_set_fpm_overflow_mul(__fpm, __behaviour) \ >> + ((__fpm & ~((uint64_t)0x1 << 14)) | ((__behaviour & (uint64_t)0x1) << 14)) Nit: should use the international (= US) spelling "behavior". >> +#define arm_set_fpm_lscale(__fpm, __scale) \ >> + ((__fpm & ~((uint64_t)0x7f << 16)) | ((__scale & (uint64_t)0x7f) << 16)) >> +#define arm_set_fpm_lscale2(__fpm, __scale) \ >> + ((__fpm & ~((uint64_t)0x3f << 32)) | ((__scale & (uint64_t)0x3f) << 32)) >> +#define arm_set_fpm_nscale(__fpm, __scale) \ >> + ((__fpm & ~((uint64_t)0xff << 24)) | ((__scale & (uint64_t)0xff) << 24)) >> + > > I’d feel more comfortable if these were implemented as proper intrinsics in aarch64-builtins.cc <http://aarch64-builtins.cc/> rather than open-coding them like that. For one it would guarantee tighter control over the code generation. > This version would likely fluctuate a lot based on optimization level, evolution of unrelated passes in the mined and other things. > Additionally, the ACLE mandates certain ranges for the scale parameter in these intrinsics. Having them as builtins would allow us to add validation code to help the user here. These helpers don't map to specific FP8 instructions though. They're just an abstraction and an aid to the programmer, so that people don't have to write hard-coded masks or their own shift/insert code. There's no expectation that a given macro/function will produce a given code sequence. Also, the parameters are allowed to be variable, so I don't think the ranges are something that the compiler should enforce directly. Thanks, Richard > >> +#endif >> + >> #ifdef __cplusplus >> } >> #endif >> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >> new file mode 100644 >> index 00000000000..e654326c387 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >> @@ -0,0 +1,52 @@ >> +/* Test the fp8 ACLE helper functions. */ >> +/* { dg-do compile } */ >> +/* { dg-options "-std=c90 -pedantic-errors -O1 -march=armv9.4-a+fp8" } */ >> + >> +#include <arm_acle.h> >> + >> +void >> +test_prepare_fpmr_sysreg () >> +{ >> + >> +#define _S_EQ(expr, expected) \ >> + _Static_assert (expr == expected, #expr " == " #expected) >> + >> + _S_EQ (arm_fpm_init (), 0); >> + >> + /* Bits [2:0] */ >> + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >> + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x1); >> + >> + /* Bits [5:3] */ >> + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >> + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x8); >> + >> + /* Bits [8:6] */ >> + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >> + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x40); >> + >> + /* Bit 14 */ >> + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_INFNAN), 0); >> + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_SATURATE), >> + 0x4000); >> + >> + /* Bit 15 */ >> + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_INFNAN), 0); >> + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_SATURATE), >> + 0x8000); >> + >> + /* Bits [22:16] */ >> + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 0), 0); >> + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 127), 0x7F0000); >> + >> + /* Bits [37:32] */ >> + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 0), 0); >> + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 63), 0x3F00000000); >> + >> + /* Bits [31:24] */ >> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 0), 0); >> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 127), 0x7F000000); >> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), -128), 0x80000000); >> + > > Given my comment above, I’d expect to see some code generation tests as well to check that the FPM modification functions above generate tight code. > > Thanks, > Kyrill > > >> +#undef _S_EQ >> +}
> On 22 Jul 2024, at 17:16, Richard Sandiford <richard.sandiford@arm.com> wrote: > > External email: Use caution opening links or attachments > > > Kyrylo Tkachov <ktkachov@nvidia.com> writes: >> Hi Claudio, >> >>> On 22 Jul 2024, at 11:30, Claudio Bantaloukas <claudio.bantaloukas@arm.com> wrote: >>> >>> External email: Use caution opening links or attachments >>> >>> >>> The ACLE declares several helper types and functions to >>> facilitate construction of `fpm` arguments. >>> >>> gcc/ChangeLog: >>> >>> * config/aarch64/arm_acle.h (fpm_t): New type representing fpmr values. >>> (enum __ARM_FPM_FORMAT): New enum representing valid fp8 formats. >>> (enum __ARM_FPM_OVERFLOW): New enum representing how some fp8 >>> calculations work. >>> (arm_fpm_init): New. >>> (arm_set_fpm_src1_format): Likewise. >>> (arm_set_fpm_src2_format): Likewise. >>> (arm_set_fpm_dst_format): Likewise. >>> (arm_set_fpm_overflow_cvt): Likewise. >>> (arm_set_fpm_overflow_mul): Likewise. >>> (arm_set_fpm_lscale): Likewise. >>> (arm_set_fpm_lscale2): Likewise. >>> (arm_set_fpm_nscale): Likewise. >> >> According to the FP8 pull request in ACLE: >> https://github.com/ARM-software/acle/pull/323/files >> These functions should all be in the implementation namespace, that is they should begin with __arm*. >> >>> >>> gcc/testsuite/ChangeLog: >>> >>> * gcc.target/aarch64/acle/fp8-helpers.c: New test of fpmr helper functions. >>> --- >>> gcc/config/aarch64/arm_acle.h | 37 +++++++++++++ >>> .../gcc.target/aarch64/acle/fp8-helpers.c | 52 +++++++++++++++++++ >>> 2 files changed, 89 insertions(+) >>> create mode 100644 gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >>> >>> diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h >>> index 2aa681090fa..4f0c600582a 100644 >>> --- a/gcc/config/aarch64/arm_acle.h >>> +++ b/gcc/config/aarch64/arm_acle.h >>> @@ -385,6 +385,43 @@ __rndrrs (uint64_t *__res) >>> >>> #pragma GCC pop_options >>> >>> +#ifdef __ARM_FEATURE_FP8 > > The definitions should be unconditional, so that they're still available > if someone initially compiles without fp8 but enables it later in the > translation unit. > >>> + >>> +typedef uint64_t fpm_t; >>> + >>> +enum __ARM_FPM_FORMAT >>> +{ >>> + __ARM_FPM_E5M2, >>> + __ARM_FPM_E4M3, >>> +}; >>> + >>> +enum __ARM_FPM_OVERFLOW >>> +{ >>> + __ARM_FPM_INFNAN, >>> + __ARM_FPM_SATURATE, >>> +}; >>> + >>> +#define arm_fpm_init() (0) >>> + >>> +#define arm_set_fpm_src1_format(__fpm, __format) \ >>> + ((__fpm & ~(uint64_t)0x7) | (__format & (uint64_t)0x7)) >>> +#define arm_set_fpm_src2_format(__fpm, __format) \ >>> + ((__fpm & ~((uint64_t)0x7 << 3)) | ((__format & (uint64_t)0x7) << 3)) >>> +#define arm_set_fpm_dst_format(__fpm, __format) \ >>> + ((__fpm & ~((uint64_t)0x7 << 6)) | ((__format & (uint64_t)0x7) << 6)) >>> +#define arm_set_fpm_overflow_cvt(__fpm, __behaviour) \ >>> + ((__fpm & ~((uint64_t)0x1 << 15)) | ((__behaviour & (uint64_t)0x1) << 15)) >>> +#define arm_set_fpm_overflow_mul(__fpm, __behaviour) \ >>> + ((__fpm & ~((uint64_t)0x1 << 14)) | ((__behaviour & (uint64_t)0x1) << 14)) > > Nit: should use the international (= US) spelling "behavior". > >>> +#define arm_set_fpm_lscale(__fpm, __scale) \ >>> + ((__fpm & ~((uint64_t)0x7f << 16)) | ((__scale & (uint64_t)0x7f) << 16)) >>> +#define arm_set_fpm_lscale2(__fpm, __scale) \ >>> + ((__fpm & ~((uint64_t)0x3f << 32)) | ((__scale & (uint64_t)0x3f) << 32)) >>> +#define arm_set_fpm_nscale(__fpm, __scale) \ >>> + ((__fpm & ~((uint64_t)0xff << 24)) | ((__scale & (uint64_t)0xff) << 24)) >>> + >> >> I’d feel more comfortable if these were implemented as proper intrinsics in aarch64-builtins.cc <http://aarch64-builtins.cc/> rather than open-coding them like that. For one it would guarantee tighter control over the code generation. >> This version would likely fluctuate a lot based on optimization level, evolution of unrelated passes in the mined and other things. >> Additionally, the ACLE mandates certain ranges for the scale parameter in these intrinsics. Having them as builtins would allow us to add validation code to help the user here. > > These helpers don't map to specific FP8 instructions though. > They're just an abstraction and an aid to the programmer, so that people > don't have to write hard-coded masks or their own shift/insert code. > There's no expectation that a given macro/function will produce a given > code sequence. > > Also, the parameters are allowed to be variable, so I don't think the > ranges are something that the compiler should enforce directly. I see, I guess it makes sense then, I withdraw my objection :) Thanks, Kyrill > > Thanks, > Richard > >> >>> +#endif >>> + >>> #ifdef __cplusplus >>> } >>> #endif >>> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >>> new file mode 100644 >>> index 00000000000..e654326c387 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c >>> @@ -0,0 +1,52 @@ >>> +/* Test the fp8 ACLE helper functions. */ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-std=c90 -pedantic-errors -O1 -march=armv9.4-a+fp8" } */ >>> + >>> +#include <arm_acle.h> >>> + >>> +void >>> +test_prepare_fpmr_sysreg () >>> +{ >>> + >>> +#define _S_EQ(expr, expected) \ >>> + _Static_assert (expr == expected, #expr " == " #expected) >>> + >>> + _S_EQ (arm_fpm_init (), 0); >>> + >>> + /* Bits [2:0] */ >>> + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >>> + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x1); >>> + >>> + /* Bits [5:3] */ >>> + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >>> + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x8); >>> + >>> + /* Bits [8:6] */ >>> + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); >>> + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x40); >>> + >>> + /* Bit 14 */ >>> + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_INFNAN), 0); >>> + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_SATURATE), >>> + 0x4000); >>> + >>> + /* Bit 15 */ >>> + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_INFNAN), 0); >>> + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_SATURATE), >>> + 0x8000); >>> + >>> + /* Bits [22:16] */ >>> + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 0), 0); >>> + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 127), 0x7F0000); >>> + >>> + /* Bits [37:32] */ >>> + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 0), 0); >>> + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 63), 0x3F00000000); >>> + >>> + /* Bits [31:24] */ >>> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 0), 0); >>> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 127), 0x7F000000); >>> + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), -128), 0x80000000); >>> + >> >> Given my comment above, I’d expect to see some code generation tests as well to check that the FPM modification functions above generate tight code. >> >> Thanks, >> Kyrill >> >> >>> +#undef _S_EQ >>> +}
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 2aa681090fa..4f0c600582a 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -385,6 +385,43 @@ __rndrrs (uint64_t *__res) #pragma GCC pop_options +#ifdef __ARM_FEATURE_FP8 + +typedef uint64_t fpm_t; + +enum __ARM_FPM_FORMAT +{ + __ARM_FPM_E5M2, + __ARM_FPM_E4M3, +}; + +enum __ARM_FPM_OVERFLOW +{ + __ARM_FPM_INFNAN, + __ARM_FPM_SATURATE, +}; + +#define arm_fpm_init() (0) + +#define arm_set_fpm_src1_format(__fpm, __format) \ + ((__fpm & ~(uint64_t)0x7) | (__format & (uint64_t)0x7)) +#define arm_set_fpm_src2_format(__fpm, __format) \ + ((__fpm & ~((uint64_t)0x7 << 3)) | ((__format & (uint64_t)0x7) << 3)) +#define arm_set_fpm_dst_format(__fpm, __format) \ + ((__fpm & ~((uint64_t)0x7 << 6)) | ((__format & (uint64_t)0x7) << 6)) +#define arm_set_fpm_overflow_cvt(__fpm, __behaviour) \ + ((__fpm & ~((uint64_t)0x1 << 15)) | ((__behaviour & (uint64_t)0x1) << 15)) +#define arm_set_fpm_overflow_mul(__fpm, __behaviour) \ + ((__fpm & ~((uint64_t)0x1 << 14)) | ((__behaviour & (uint64_t)0x1) << 14)) +#define arm_set_fpm_lscale(__fpm, __scale) \ + ((__fpm & ~((uint64_t)0x7f << 16)) | ((__scale & (uint64_t)0x7f) << 16)) +#define arm_set_fpm_lscale2(__fpm, __scale) \ + ((__fpm & ~((uint64_t)0x3f << 32)) | ((__scale & (uint64_t)0x3f) << 32)) +#define arm_set_fpm_nscale(__fpm, __scale) \ + ((__fpm & ~((uint64_t)0xff << 24)) | ((__scale & (uint64_t)0xff) << 24)) + +#endif + #ifdef __cplusplus } #endif diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c new file mode 100644 index 00000000000..e654326c387 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/acle/fp8-helpers.c @@ -0,0 +1,52 @@ +/* Test the fp8 ACLE helper functions. */ +/* { dg-do compile } */ +/* { dg-options "-std=c90 -pedantic-errors -O1 -march=armv9.4-a+fp8" } */ + +#include <arm_acle.h> + +void +test_prepare_fpmr_sysreg () +{ + +#define _S_EQ(expr, expected) \ + _Static_assert (expr == expected, #expr " == " #expected) + + _S_EQ (arm_fpm_init (), 0); + + /* Bits [2:0] */ + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); + _S_EQ (arm_set_fpm_src1_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x1); + + /* Bits [5:3] */ + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); + _S_EQ (arm_set_fpm_src2_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x8); + + /* Bits [8:6] */ + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E5M2), 0); + _S_EQ (arm_set_fpm_dst_format (arm_fpm_init (), __ARM_FPM_E4M3), 0x40); + + /* Bit 14 */ + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_INFNAN), 0); + _S_EQ (arm_set_fpm_overflow_mul (arm_fpm_init (), __ARM_FPM_SATURATE), + 0x4000); + + /* Bit 15 */ + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_INFNAN), 0); + _S_EQ (arm_set_fpm_overflow_cvt (arm_fpm_init (), __ARM_FPM_SATURATE), + 0x8000); + + /* Bits [22:16] */ + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 0), 0); + _S_EQ (arm_set_fpm_lscale (arm_fpm_init (), 127), 0x7F0000); + + /* Bits [37:32] */ + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 0), 0); + _S_EQ (arm_set_fpm_lscale2 (arm_fpm_init (), 63), 0x3F00000000); + + /* Bits [31:24] */ + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 0), 0); + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), 127), 0x7F000000); + _S_EQ (arm_set_fpm_nscale (arm_fpm_init (), -128), 0x80000000); + +#undef _S_EQ +}