Message ID | 000101cf58af$83900a90$8ab01fb0$@com |
---|---|
State | New |
Headers | show |
On Tue, Apr 15, 2014 at 02:35:06PM +0100, Wilco wrote: > Hi, > > This patch adds a generic implementation of HAVE_RM_CTX using standard fenv calls. As a result math > functions using SET_RESTORE_ROUND* macros do not suffer from a large slowdown on targets which do > not implement optimized libc_fe*_ctx inline functions. Most of the libc_fe* inline functions are now > unused and could be removed in the future (there are a few math functions left which use a mixture > of standard fenv calls and libc_fe* inline functions - they could be updated to use > SET_RESTORE_ROUND or improved to avoid expensive fenv manipulations across just a few FP > instructions). > > libc_feholdsetround*_noex_ctx is added to enable better optimization of SET_RESTORE_ROUND_NOEX* > implementations. > > Performance measurements on ARM and x86 of sin() show significant gains over the current default, > fairly close to a highly optimized fenv_private: > > ARM x86 > no fenv_private : 100% 100% > generic HAVE_RM_CTX : 250% 350% > fenv_private (CTX) : 250% 450% Nice. This looks useful, but I don't know if you should wait for feedback from arch maintainers that would actually be affected by this change. > Wilco > > ChangeLog: > 2014-04-15 Wilco <wdijkstr@arm.com> > > * sysdeps/generic/math_private.h: Add generic HAVE_RM_CTX > implementation. New function (libc_feholdsetround_noex_ctx). > > --- > sysdeps/generic/math_private.h | 116 ++++++++++++++++++++++++++++++++-------- > 1 file changed, 93 insertions(+), 23 deletions(-) > > diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h > index 9b881a3..fade483 100644 > --- a/sysdeps/generic/math_private.h > +++ b/sysdeps/generic/math_private.h > @@ -20,6 +20,7 @@ > #include <stdint.h> > #include <sys/types.h> > #include <fenv.h> > +#include <get-rounding-mode.h> > > /* The original fdlibm code used statements like: > n0 = ((*(int*)&one)>>29)^1; * index of high word * > @@ -557,6 +558,16 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) > block is different from the current state. This saves a lot of time when > the floating point unit is much slower than the fixed point units. */ > > +# ifndef libc_feholdsetround_noex_ctx > +# define libc_feholdsetround_noex_ctx libc_feholdsetround_ctx > +# endif > +# ifndef libc_feholdsetround_noexf_ctx > +# define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx > +# endif > +# ifndef libc_feholdsetround_noexl_ctx > +# define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx > +# endif > + > # ifndef libc_feresetround_noex_ctx > # define libc_feresetround_noex_ctx libc_fesetenv_ctx > # endif > @@ -567,24 +578,80 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) > # define libc_feresetround_noexl_ctx libc_fesetenvl_ctx > # endif > > -# ifndef libc_feholdsetround_53bit_ctx > -# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx > -# endif > +#else > > -# ifndef libc_feresetround_53bit_ctx > -# define libc_feresetround_53bit_ctx libc_feresetround_ctx > -# endif > +/* Default implementation using standard fenv functions. > + Avoid unnecessary rounding mode changes by first checking the > + current rounding mode. Note the use of __glibc_unlikely is > + important for performance. */ > > -# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ > - struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx))); \ > - ROUNDFUNC ## _ctx (&ctx, (RM)) > -#else > -# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \ > - fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC))); \ > - ROUNDFUNC (&__libc_save_rm, (RM)) > +static __always_inline void > +libc_feholdsetround_ctx (struct rm_ctx *ctx, int round) > +{ > + ctx->updated_status = false; > + > + /* Update rounding mode only if different. */ > + if (__glibc_unlikely (round != get_rounding_mode ())) > + { > + ctx->updated_status = true; > + fegetenv (&ctx->env); > + fesetround (round); Shouldn't this be __fegetenv and __fesetround? We shouldn't need to go through the extra dereference since these functions are internal-only. Likewise for all uses below. > + } > +} > + > +static __always_inline void > +libc_feresetround_ctx (struct rm_ctx *ctx) > +{ > + /* Restore the rounding mode if updated. */ > + if (__glibc_unlikely (ctx->updated_status)) > + feupdateenv (&ctx->env); > +} > + > +static __always_inline void > +libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round) > +{ > + /* Save exception flags and rounding mode. */ > + fegetenv (&ctx->env); > + > + /* Update rounding mode only if different. */ > + if (__glibc_unlikely (round != get_rounding_mode ())) > + fesetround (round); > +} > + > +static __always_inline void > +libc_feresetround_noex_ctx (struct rm_ctx *ctx) > +{ > + /* Restore exception flags and rounding mode. */ > + fesetenv (&ctx->env); > +} > + > +# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx > +# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx > +# define libc_feresetroundf_ctx libc_feresetround_ctx > +# define libc_feresetroundl_ctx libc_feresetround_ctx > + > +# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx > +# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx > +# define libc_feresetround_noexf_ctx libc_feresetround_noex_ctx > +# define libc_feresetround_noexl_ctx libc_feresetround_noex_ctx > + > +#endif > + > +#ifndef libc_feholdsetround_53bit_ctx > +# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx > #endif > +#ifndef libc_feresetround_53bit_ctx > +# define libc_feresetround_53bit_ctx libc_feresetround_ctx > +#endif > + > +#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ > + struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \ > + ROUNDFUNC ## _ctx (&ctx, (RM)) > > -/* Save and restore the rounding mode within a lexical block. */ > +/* Set the rounding mode within a lexical block. Restore the rounding mode to > + the value at the start of the block. The exception mode must be preserved. > + Exceptions raised within the block must be set in the exception flags. > + Non-stop mode may be enabled inside the block. */ > > #define SET_RESTORE_ROUND(RM) \ > SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround) > @@ -593,15 +660,18 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) > #define SET_RESTORE_ROUNDL(RM) \ > SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl) > > -/* Save and restore the rounding mode within a lexical block, and also > - the set of exceptions raised within the block may be discarded. */ > - > -#define SET_RESTORE_ROUND_NOEX(RM) \ > - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex) > -#define SET_RESTORE_ROUND_NOEXF(RM) \ > - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf) > -#define SET_RESTORE_ROUND_NOEXL(RM) \ > - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl) > +/* Set the rounding mode within a lexical block. Restore the rounding mode to > + the value at the start of the block. The exception mode must be preserved. > + Exceptions raised within the block must be discarded, and exception flags > + are restored to the value at the start of the block. > + Non-stop mode may be enabled inside the block. */ > + > +#define SET_RESTORE_ROUND_NOEX(RM) SET_RESTORE_ROUND_GENERIC (RM, \ > + libc_feholdsetround_noex, libc_feresetround_noex) > +#define SET_RESTORE_ROUND_NOEXF(RM) SET_RESTORE_ROUND_GENERIC (RM, \ > + libc_feholdsetround_noexf, libc_feresetround_noexf) > +#define SET_RESTORE_ROUND_NOEXL(RM) SET_RESTORE_ROUND_GENERIC (RM, \ > + libc_feholdsetround_noexl, libc_feresetround_noexl) Minor bikeshedding: the following formatting would look nicer: #define SET_RESTORE_ROUND_NOEX(RM) \ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_noex, \ libc_feresetround_noex) > /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ > #define SET_RESTORE_ROUND_53BIT(RM) \ > -- > 1.7.9.5 > > > > >
> From: Siddhesh Poyarekar [mailto:siddhesh@redhat.com] > On Tue, Apr 15, 2014 at 02:35:06PM +0100, Wilco wrote: > > Hi, > > > > This patch adds a generic implementation of HAVE_RM_CTX using standard fenv calls. As a > result math > > functions using SET_RESTORE_ROUND* macros do not suffer from a large slowdown on targets > which do > > not implement optimized libc_fe*_ctx inline functions. Most of the libc_fe* inline functions > are now > > unused and could be removed in the future (there are a few math functions left which use a > mixture > > of standard fenv calls and libc_fe* inline functions - they could be updated to use > > SET_RESTORE_ROUND or improved to avoid expensive fenv manipulations across just a few FP > > instructions). > > > > libc_feholdsetround*_noex_ctx is added to enable better optimization of > SET_RESTORE_ROUND_NOEX* > > implementations. > > > > Performance measurements on ARM and x86 of sin() show significant gains over the current > default, > > fairly close to a highly optimized fenv_private: > > > > ARM x86 > > no fenv_private : 100% 100% > > generic HAVE_RM_CTX : 250% 350% > > fenv_private (CTX) : 250% 450% > > Nice. This looks useful, but I don't know if you should wait for > feedback from arch maintainers that would actually be affected by this > change. Would anyone mind a huge speedup? Basically I see this as a first step towards cleaning up math_private.h and the way math functions use fenv. Ideally the default implementation should be efficient across all targets, and it looks like this is easy to achieve. If the rounding mode changes had been done like this when originally added, targets wouldn't have experienced major slowdowns of common math functions. Wilco
On Wed, Apr 30, 2014 at 02:39:36PM +0100, Wilco wrote: > Would anyone mind a huge speedup? They wouldn't, but I haven't done an exhaustive code check across architectures (or testing for that matter) to confidently assert that the change would definitely be welcome by all. That is why I asked for another opinion from another maintainer on this. > Basically I see this as a first step towards cleaning up > math_private.h and the way math functions use fenv. Ideally the > default implementation should be efficient across all targets, and > it looks like this is easy to achieve. Agreed and if you can get another maintainer to agree, then I think this patch is good to go with the changes I suggested earlier. Siddhesh
> Siddhesh Poyarekar wrote: > On Wed, Apr 30, 2014 at 02:39:36PM +0100, Wilco wrote: > > Would anyone mind a huge speedup? > > They wouldn't, but I haven't done an exhaustive code check across > architectures (or testing for that matter) to confidently assert that > the change would definitely be welcome by all. That is why I asked > for another opinion from another maintainer on this. Marcus (AArch64) already reviewed it. > > Basically I see this as a first step towards cleaning up > > math_private.h and the way math functions use fenv. Ideally the > > default implementation should be efficient across all targets, and > > it looks like this is easy to achieve. > > Agreed and if you can get another maintainer to agree, then I think > this patch is good to go with the changes I suggested earlier. As for your other comment: > Shouldn't this be __fegetenv and __fesetround? We shouldn't need to > go through the extra dereference since these functions are > internal-only. Likewise for all uses below. I agree we should avoid PLTs for internal calls. However I'm certain most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver is used inconsistently, so this is yet another area that needs a cleanup. Solving that wasn't the purpose of my patch - in fact it is consistent with the rest of math_private.h. Btw can we agree on what the rules are? Eg. * Every exported symbol should have a libm_hidden_ver. * All internal calls must use the hidden symbol. I'll improve the formatting of SET_RESTORE_ROUND_NOEX*. Cheers, Wilco
On 12 May 2014 17:32, Wilco <wdijkstr@arm.com> wrote: > Marcus (AArch64) already reviewed it. I don't see his review on this thread; is it on another thread? > I agree we should avoid PLTs for internal calls. However I'm certain > most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver > is used inconsistently, so this is yet another area that needs a cleanup. Fair enough. We can look at it as a separate cleanup then. > Solving that wasn't the purpose of my patch - in fact it is consistent > with the rest of math_private.h. > > Btw can we agree on what the rules are? Eg. > > * Every exported symbol should have a libm_hidden_ver. > * All internal calls must use the hidden symbol. An exported symbol should have a hidden alias if it is called internally (and internal calls should use that hidden symbol), to avoid the extra PLT dereference in the internal call. So it is not necessary to define a hidden alias for *every* exported symbol. Siddhesh
On Mon, 12 May 2014, Wilco wrote: > > Shouldn't this be __fegetenv and __fesetround? We shouldn't need to > > go through the extra dereference since these functions are > > internal-only. Likewise for all uses below. > > I agree we should avoid PLTs for internal calls. However I'm certain > most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver > is used inconsistently, so this is yet another area that needs a cleanup. As I said in <https://sourceware.org/ml/libc-alpha/2014-04/msg00471.html>, PLT avoidance (via hidden_def etc.) is a separate matter from namespace-cleanness (via __-prefixed names). I think you can assume that for the symbols with libm_hidden_proto in include/fenv.h, all architectures use libm_hidden_def (otherwise they'd have had obvious build failures, at least if the symbol is in fact used internally). *But*, if you want to use an __-prefixed name, you need to make sure all architectures do in fact define that name and then use a weak alias for the public name, as they may well not be consistent in that regard. And it may still be useful to declare that __-prefixed name as hidden (i.e. use attribute_hidden on the prototype) or use hidden_proto / hidden_def with it; I'm not sure how important any compile-time optimizations this enables are, but it does allow for such optimizations at compile time rather than just at link time. Namespace-cleanness is an issue here when the functions get used from C90 libm functions, since C90 doesn't include the <fenv.h> functions (I expect it's also a pre-existing bug there and so any fix should include a bug report in Bugzilla, etc.) - C90 functions shouldn't use the fe* names (or a reserved name that brings in a non-reserved name as a *strong* alias). It's not relevant for calls from non-C90 functions (e.g. those for float and long double) except where in some other standard that lacks the <fenv.h> functions (e.g. Unix98 has some of the C99 functions for double). But in practice it would be fragile to rely on which math_private / fenv_private facilities get called from which libm functions, so I think systematically using the __-* names should be the eventual goal.
> Siddhesh Poyarekar wrote: > On 12 May 2014 17:32, Wilco <wdijkstr@arm.com> wrote: > > Marcus (AArch64) already reviewed it. > > I don't see his review on this thread; is it on another thread? This was an internal review before the patch goes public. > > I agree we should avoid PLTs for internal calls. However I'm certain > > most targets wouldn't build if I used __fegetenv etc. libm_hidden_ver > > is used inconsistently, so this is yet another area that needs a cleanup. > > Fair enough. We can look at it as a separate cleanup then. > > > Solving that wasn't the purpose of my patch - in fact it is consistent > > with the rest of math_private.h. > > > > Btw can we agree on what the rules are? Eg. > > > > * Every exported symbol should have a libm_hidden_ver. > > * All internal calls must use the hidden symbol. > > An exported symbol should have a hidden alias if it is called > internally (and internal calls should use that hidden symbol), to > avoid the extra PLT dereference in the internal call. So it is not > necessary to define a hidden alias for *every* exported symbol. It may not be necessary but it would be the only way to avoid inconsistencies. Note it turns out the fenv functions called by math_private.h already use direct calls. So libm_hidden_def appears to be sufficient to bypass PLTs, and the libm_hidden_ver is only required to avoid the namespace issue Joseph mentioned. I've attatched the new version. Besides the formatting I blocked the annoying undefined HAVE_RM_CTX warnings. Cheers, Wilco
diff --git a/sysdeps/generic/math_private.h b/sysdeps/generic/math_private.h index 9b881a3..fade483 100644 --- a/sysdeps/generic/math_private.h +++ b/sysdeps/generic/math_private.h @@ -20,6 +20,7 @@ #include <stdint.h> #include <sys/types.h> #include <fenv.h> +#include <get-rounding-mode.h> /* The original fdlibm code used statements like: n0 = ((*(int*)&one)>>29)^1; * index of high word * @@ -557,6 +558,16 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) block is different from the current state. This saves a lot of time when the floating point unit is much slower than the fixed point units. */ +# ifndef libc_feholdsetround_noex_ctx +# define libc_feholdsetround_noex_ctx libc_feholdsetround_ctx +# endif +# ifndef libc_feholdsetround_noexf_ctx +# define libc_feholdsetround_noexf_ctx libc_feholdsetroundf_ctx +# endif +# ifndef libc_feholdsetround_noexl_ctx +# define libc_feholdsetround_noexl_ctx libc_feholdsetroundl_ctx +# endif + # ifndef libc_feresetround_noex_ctx # define libc_feresetround_noex_ctx libc_fesetenv_ctx # endif @@ -567,24 +578,80 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) # define libc_feresetround_noexl_ctx libc_fesetenvl_ctx # endif -# ifndef libc_feholdsetround_53bit_ctx -# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx -# endif +#else -# ifndef libc_feresetround_53bit_ctx -# define libc_feresetround_53bit_ctx libc_feresetround_ctx -# endif +/* Default implementation using standard fenv functions. + Avoid unnecessary rounding mode changes by first checking the + current rounding mode. Note the use of __glibc_unlikely is + important for performance. */ -# define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ - struct rm_ctx ctx __attribute__((cleanup(CLEANUPFUNC ## _ctx))); \ - ROUNDFUNC ## _ctx (&ctx, (RM)) -#else -# define SET_RESTORE_ROUND_GENERIC(RM, ROUNDFUNC, CLEANUPFUNC) \ - fenv_t __libc_save_rm __attribute__((cleanup(CLEANUPFUNC))); \ - ROUNDFUNC (&__libc_save_rm, (RM)) +static __always_inline void +libc_feholdsetround_ctx (struct rm_ctx *ctx, int round) +{ + ctx->updated_status = false; + + /* Update rounding mode only if different. */ + if (__glibc_unlikely (round != get_rounding_mode ())) + { + ctx->updated_status = true; + fegetenv (&ctx->env); + fesetround (round); + } +} + +static __always_inline void +libc_feresetround_ctx (struct rm_ctx *ctx) +{ + /* Restore the rounding mode if updated. */ + if (__glibc_unlikely (ctx->updated_status)) + feupdateenv (&ctx->env); +} + +static __always_inline void +libc_feholdsetround_noex_ctx (struct rm_ctx *ctx, int round) +{ + /* Save exception flags and rounding mode. */ + fegetenv (&ctx->env); + + /* Update rounding mode only if different. */ + if (__glibc_unlikely (round != get_rounding_mode ())) + fesetround (round); +} + +static __always_inline void +libc_feresetround_noex_ctx (struct rm_ctx *ctx) +{ + /* Restore exception flags and rounding mode. */ + fesetenv (&ctx->env); +} + +# define libc_feholdsetroundf_ctx libc_feholdsetround_ctx +# define libc_feholdsetroundl_ctx libc_feholdsetround_ctx +# define libc_feresetroundf_ctx libc_feresetround_ctx +# define libc_feresetroundl_ctx libc_feresetround_ctx + +# define libc_feholdsetround_noexf_ctx libc_feholdsetround_noex_ctx +# define libc_feholdsetround_noexl_ctx libc_feholdsetround_noex_ctx +# define libc_feresetround_noexf_ctx libc_feresetround_noex_ctx +# define libc_feresetround_noexl_ctx libc_feresetround_noex_ctx + +#endif + +#ifndef libc_feholdsetround_53bit_ctx +# define libc_feholdsetround_53bit_ctx libc_feholdsetround_ctx #endif +#ifndef libc_feresetround_53bit_ctx +# define libc_feresetround_53bit_ctx libc_feresetround_ctx +#endif + +#define SET_RESTORE_ROUND_GENERIC(RM,ROUNDFUNC,CLEANUPFUNC) \ + struct rm_ctx ctx __attribute__((cleanup (CLEANUPFUNC ## _ctx))); \ + ROUNDFUNC ## _ctx (&ctx, (RM)) -/* Save and restore the rounding mode within a lexical block. */ +/* Set the rounding mode within a lexical block. Restore the rounding mode to + the value at the start of the block. The exception mode must be preserved. + Exceptions raised within the block must be set in the exception flags. + Non-stop mode may be enabled inside the block. */ #define SET_RESTORE_ROUND(RM) \ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround) @@ -593,15 +660,18 @@ default_libc_feupdateenv_test (fenv_t *e, int ex) #define SET_RESTORE_ROUNDL(RM) \ SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetroundl) -/* Save and restore the rounding mode within a lexical block, and also - the set of exceptions raised within the block may be discarded. */ - -#define SET_RESTORE_ROUND_NOEX(RM) \ - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround, libc_feresetround_noex) -#define SET_RESTORE_ROUND_NOEXF(RM) \ - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundf, libc_feresetround_noexf) -#define SET_RESTORE_ROUND_NOEXL(RM) \ - SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetroundl, libc_feresetround_noexl) +/* Set the rounding mode within a lexical block. Restore the rounding mode to + the value at the start of the block. The exception mode must be preserved. + Exceptions raised within the block must be discarded, and exception flags + are restored to the value at the start of the block. + Non-stop mode may be enabled inside the block. */ + +#define SET_RESTORE_ROUND_NOEX(RM) SET_RESTORE_ROUND_GENERIC (RM, \ + libc_feholdsetround_noex, libc_feresetround_noex) +#define SET_RESTORE_ROUND_NOEXF(RM) SET_RESTORE_ROUND_GENERIC (RM, \ + libc_feholdsetround_noexf, libc_feresetround_noexf) +#define SET_RESTORE_ROUND_NOEXL(RM) SET_RESTORE_ROUND_GENERIC (RM, \ + libc_feholdsetround_noexl, libc_feresetround_noexl) /* Like SET_RESTORE_ROUND, but also set rounding precision to 53 bits. */ #define SET_RESTORE_ROUND_53BIT(RM) \