
replace tgammaf by the CORE-MATH implementation (correctly rounded)

Message ID 20240729113753.1527107-1-Paul.Zimmermann@inria.fr
State New
Series replace tgammaf by the CORE-MATH implementation (correctly rounded)

Commit Message

Paul Zimmermann July 29, 2024, 11:37 a.m. UTC
The CORE-MATH implementation is correctly rounded (for any rounding mode).
This can be checked by exhaustive tests in a few minutes since there are
less than 2^32 values to check (against GNU MPFR for example).
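
For reference, here is a minimal sketch of such an exhaustive check against MPFR
(illustration only, not part of the patch: it assumes MPFR >= 3.0 for
mpfr_set_flt/mpfr_get_flt, checks only round-to-nearest, and glosses over double
rounding in the subnormal range):

  #include <stdio.h>
  #include <stdint.h>
  #include <string.h>
  #include <math.h>
  #include <mpfr.h>

  int
  main (void)
  {
    mpfr_t t;
    mpfr_init2 (t, 24);                     /* binary32 precision */
    uint32_t u = 0;
    do
      {
        float x, y, ref;
        memcpy (&x, &u, sizeof x);          /* enumerate every binary32 bit pattern */
        y = tgammaf (x);
        mpfr_set_flt (t, x, MPFR_RNDN);     /* exact: x fits in 24 bits */
        mpfr_gamma (t, t, MPFR_RNDN);       /* correctly rounded to 24 bits */
        ref = mpfr_get_flt (t, MPFR_RNDN);
        if (y != ref && !(isnan (y) && isnan (ref)))
          printf ("mismatch at x=%a: got %a, want %a\n",
                  (double) x, (double) y, (double) ref);
      }
    while (++u != 0);
    mpfr_clear (t);
    return 0;
  }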

This patch also adds benchtest values for tgammaf. With the initial
GNU libc code it gave:

  "tgammaf": {
   "": {
    "duration": 3.50188e+09,
    "iterations": 2e+07,
    "max": 602.891,
    "min": 65.1415,
    "mean": 175.094
   }
  }

With the new code:

  "tgammaf": {
   "": {
    "duration": 3.27888e+09,
    "iterations": 4.8e+07,
    "max": 232.661,
    "min": 29.932,
    "mean": 68.3099
   }
  }
---
 benchtests/Makefile                           |   1 +
 .../strcoll-inputs/filelist#en_US.UTF-8       |   1 -
 math/w_tgammaf_compat.c                       |  52 ++-
 sysdeps/ieee754/flt-32/e_gammaf_r.c           | 307 +++++++-----------
 sysdeps/x86_64/fpu/libm-test-ulps             |   4 -
 5 files changed, 136 insertions(+), 229 deletions(-)

Comments

Vincent Lefevre July 29, 2024, 12:30 p.m. UTC | #1
On 2024-07-29 13:37:45 +0200, Paul Zimmermann wrote:
> +	return __builtin_nanf("12");

(twice in the code)

Is this standard?
Adhemerval Zanella Netto July 29, 2024, 1:16 p.m. UTC | #2
On 29/07/24 08:37, Paul Zimmermann wrote:
> The CORE-MATH implementation is correctly rounded (for any rounding mode).
> This can be checked by exhaustive tests in a few minutes since there are
> less than 2^32 values to check (against GNU MPFR for example).
> 
> This patch also adds benchtest values for tgammaf. With the initial
> GNU libc code it gave:
> 
>   "tgammaf": {
>    "": {
>     "duration": 3.50188e+09,
>     "iterations": 2e+07,
>     "max": 602.891,
>     "min": 65.1415,
>     "mean": 175.094
>    }
>   }
> 
> With the new code:
> 
>   "tgammaf": {
>    "": {
>     "duration": 3.27888e+09,
>     "iterations": 4.8e+07,
>     "max": 232.661,
>     "min": 29.932,
>     "mean": 68.3099
>    }
>   }

Hi Paul,

Thanks for working on this; however, it seems there are real regressions on the
x86 32-bit bot [1].

Also, for this license we will need a Sign-off by you (or by the original
author) and I think this license should be compatible to be accepted.
Carlos said he will check this out as well.

[1] https://www.delorie.com/trybots/32bit/36770/math-test-float-tgamma.out

> ---
>  benchtests/Makefile                           |   1 +
>  .../strcoll-inputs/filelist#en_US.UTF-8       |   1 -
>  math/w_tgammaf_compat.c                       |  52 ++-
>  sysdeps/ieee754/flt-32/e_gammaf_r.c           | 307 +++++++-----------
>  sysdeps/x86_64/fpu/libm-test-ulps             |   4 -
>  5 files changed, 136 insertions(+), 229 deletions(-)
> 
> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 382fb7bae1..265ad34d8d 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -94,6 +94,7 @@ bench-math := \
>    tan \
>    tanh \
>    tgamma \
> +  tgammaf \
>    trunc \
>    truncf \
>    y0 \
> diff --git a/benchtests/strcoll-inputs/filelist#en_US.UTF-8 b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> index 0d8f1c722b..93142aed97 100644
> --- a/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> +++ b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> @@ -5315,7 +5315,6 @@ s_isinf.c
>  dbl2mpn.c
>  atnat.h
>  flt-32
> -e_gammaf_r.c
>  e_remainderf.c
>  s_llroundf.c
>  s_erff.c
> diff --git a/math/w_tgammaf_compat.c b/math/w_tgammaf_compat.c
> index 34e0e096e0..b5208c54d5 100644
> --- a/math/w_tgammaf_compat.c
> +++ b/math/w_tgammaf_compat.c
> @@ -2,14 +2,28 @@
>   */
>  
>  /*
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> +Copyright (c) 2023 Alexei Sibidanov.
> +
> +This file was copied from the CORE-MATH project
> +(file src/binary32/tgamma/tgammaf.c, revision 673c2af)
> +
> +Permission is hereby granted, free of charge, to any person obtaining a copy
> +of this software and associated documentation files (the "Software"), to deal
> +in the Software without restriction, including without limitation the rights
> +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> +copies of the Software, and to permit persons to whom the Software is
> +furnished to do so, subject to the following conditions:
> +
> +The above copyright notice and this permission notice shall be included in all
> +copies or substantial portions of the Software.
> +
> +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> +SOFTWARE.
>   */
>  
>  #include <errno.h>
> @@ -22,26 +36,8 @@
>  float
>  __tgammaf(float x)
>  {
> -	int local_signgam;
> -	float y = __ieee754_gammaf_r(x,&local_signgam);
> -
> -	if(__glibc_unlikely (!isfinite (y) || y == 0)
> -	   && (isfinite (x) || (isinf (x) && x < 0.0))
> -	   && _LIB_VERSION != _IEEE_) {
> -	  if (x == (float)0.0)
> -	    /* tgammaf pole */
> -	    return __kernel_standard_f(x, x, 150);
> -	  else if(floorf(x)==x&&x<0.0f)
> -	    /* tgammaf domain */
> -	    return __kernel_standard_f(x, x, 141);
> -	  else if (y == 0)
> -	    /* tgammaf underflow */
> -	    __set_errno (ERANGE);
> -	  else
> -	    /* tgammaf overflow */
> -	    return __kernel_standard_f(x, x, 140);
> -	}
> -	return local_signgam < 0 ? - y : y;
> +  int e;
> +  return __ieee754_gammaf_r (x, &e);
>  }
>  libm_alias_float (__tgamma, tgamma)
>  #endif
> diff --git a/sysdeps/ieee754/flt-32/e_gammaf_r.c b/sysdeps/ieee754/flt-32/e_gammaf_r.c
> index a9730d61c1..9df66c7757 100644
> --- a/sysdeps/ieee754/flt-32/e_gammaf_r.c
> +++ b/sysdeps/ieee754/flt-32/e_gammaf_r.c
> @@ -1,215 +1,130 @@
> -/* Implementation of gamma function according to ISO C.
> -   Copyright (C) 1997-2024 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> +/* Implementation of the gamma function for binary32.
>  
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> +Copyright (c) 2023-2024 Alexei Sibidanov.
>  
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> +This file was copied from the CORE-MATH project
> +(file src/binary32/tgamma/tgammaf.c, revision a48e352)
>  
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> +Permission is hereby granted, free of charge, to any person obtaining a copy
> +of this software and associated documentation files (the "Software"), to deal
> +in the Software without restriction, including without limitation the rights
> +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> +copies of the Software, and to permit persons to whom the Software is
> +furnished to do so, subject to the following conditions:
>  
> -#include <math.h>
> -#include <math-narrow-eval.h>
> -#include <math_private.h>
> -#include <fenv_private.h>
> -#include <math-underflow.h>
> -#include <float.h>
> -#include <libm-alias-finite.h>
> -
> -/* Coefficients B_2k / 2k(2k-1) of x^-(2k-1) inside exp in Stirling's
> -   approximation to gamma function.  */
> -
> -static const float gamma_coeff[] =
> -  {
> -    0x1.555556p-4f,
> -    -0xb.60b61p-12f,
> -    0x3.403404p-12f,
> -  };
> +The above copyright notice and this permission notice shall be included in all
> +copies or substantial portions of the Software.
>  
> -#define NCOEFF (sizeof (gamma_coeff) / sizeof (gamma_coeff[0]))
> +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> +SOFTWARE.
> + */
>  
> -/* Return gamma (X), for positive X less than 42, in the form R *
> -   2^(*EXP2_ADJ), where R is the return value and *EXP2_ADJ is set to
> -   avoid overflow or underflow in intermediate calculations.  */
> +#include <stdint.h>
> +#include <errno.h>
> +#include <libm-alias-finite.h>
>  
> -static float
> -gammaf_positive (float x, int *exp2_adj)
> -{
> -  int local_signgam;
> -  if (x < 0.5f)
> -    {
> -      *exp2_adj = 0;
> -      return __ieee754_expf (__ieee754_lgammaf_r (x + 1, &local_signgam)) / x;
> -    }
> -  else if (x <= 1.5f)
> -    {
> -      *exp2_adj = 0;
> -      return __ieee754_expf (__ieee754_lgammaf_r (x, &local_signgam));
> -    }
> -  else if (x < 2.5f)
> -    {
> -      *exp2_adj = 0;
> -      float x_adj = x - 1;
> -      return (__ieee754_expf (__ieee754_lgammaf_r (x_adj, &local_signgam))
> -	      * x_adj);
> -    }
> -  else
> -    {
> -      float eps = 0;
> -      float x_eps = 0;
> -      float x_adj = x;
> -      float prod = 1;
> -      if (x < 4.0f)
> -	{
> -	  /* Adjust into the range for applying Stirling's
> -	     approximation.  */
> -	  float n = ceilf (4.0f - x);
> -	  x_adj = math_narrow_eval (x + n);
> -	  x_eps = (x - (x_adj - n));
> -	  prod = __gamma_productf (x_adj - n, x_eps, n, &eps);
> -	}
> -      /* The result is now gamma (X_ADJ + X_EPS) / (PROD * (1 + EPS)).
> -	 Compute gamma (X_ADJ + X_EPS) using Stirling's approximation,
> -	 starting by computing pow (X_ADJ, X_ADJ) with a power of 2
> -	 factored out.  */
> -      float exp_adj = -eps;
> -      float x_adj_int = roundf (x_adj);
> -      float x_adj_frac = x_adj - x_adj_int;
> -      int x_adj_log2;
> -      float x_adj_mant = __frexpf (x_adj, &x_adj_log2);
> -      if (x_adj_mant < M_SQRT1_2f)
> -	{
> -	  x_adj_log2--;
> -	  x_adj_mant *= 2.0f;
> -	}
> -      *exp2_adj = x_adj_log2 * (int) x_adj_int;
> -      float ret = (__ieee754_powf (x_adj_mant, x_adj)
> -		   * __ieee754_exp2f (x_adj_log2 * x_adj_frac)
> -		   * __ieee754_expf (-x_adj)
> -		   * sqrtf (2 * M_PIf / x_adj)
> -		   / prod);
> -      exp_adj += x_eps * __ieee754_logf (x_adj);
> -      float bsum = gamma_coeff[NCOEFF - 1];
> -      float x_adj2 = x_adj * x_adj;
> -      for (size_t i = 1; i <= NCOEFF - 1; i++)
> -	bsum = bsum / x_adj2 + gamma_coeff[NCOEFF - 1 - i];
> -      exp_adj += bsum / x_adj;
> -      return ret + ret * __expm1f (exp_adj);
> -    }
> -}
> +typedef union {float f; uint32_t u;} b32u32_u;
> +typedef union {double f; uint64_t u;} b64u64_u;
>  
>  float
> -__ieee754_gammaf_r (float x, int *signgamp)
> +__ieee754_gammaf_r (float x, int *exp2_adj)
>  {
> -  int32_t hx;
> -  float ret;
> -
> -  GET_FLOAT_WORD (hx, x);
> +  static const struct {b32u32_u x; float f, df;} tb[] = {
> +    {{.u = 0x27de86a9u}, 0x1.268266p+47f, 0x1p22f},
> +    {{.u = 0x27e05475u}, 0x1.242422p+47f, 0x1p22f},
> +    {{.u = 0xb63befb3u}, -0x1.5cb6e4p+18f, 0x1p-7f},
> +    {{.u = 0x3c7bb570u}, 0x1.021d9p+6f, 0x1p-19f},
> +    {{.u = 0x41e886d1u}, 0x1.33136ap+98f, 0x1p73f},
> +    {{.u = 0xc067d177u}, 0x1.f6850cp-3f, 0x1p-28f},
> +    {{.f = -0x1.33b462p-4}, -0x1.befe66p+3, -0x1p-22f},
> +    {{.f = -0x1.a988b4p-1}, -0x1.a6b4ecp+2, +0x1p-23f},
> +    {{.f = 0x1.dceffcp+4}, 0x1.d3631cp+101, -0x1p-76f},
> +    {{.f = 0x1.0874c8p+0}, 0x1.f6c638p-1, 0x1p-26f},
> +  };
>  
> -  if (__glibc_unlikely ((hx & 0x7fffffff) == 0))
> -    {
> -      /* Return value for x == 0 is Inf with divide by zero exception.  */
> -      *signgamp = 0;
> -      return 1.0 / x;
> +  b32u32_u t = {.f = x};
> +  uint32_t ax = t.u<<1;
> +  if(__builtin_expect(ax>=(0xffu<<24), 0)){
> +    if(ax==(0xffu<<24)){
> +      if(t.u>>31){
> +	errno = EDOM;
> +	return __builtin_nanf("12");
> +      }
> +      return x;
>      }
> -  if (__builtin_expect (hx < 0, 0)
> -      && (uint32_t) hx < 0xff800000 && rintf (x) == x)
> -    {
> -      /* Return value for integer x < 0 is NaN with invalid exception.  */
> -      *signgamp = 0;
> -      return (x - x) / (x - x);
> +    return x; // nan
> +  }
> +  double z = x;
> +  if(__builtin_expect(ax<0x6d000000u, 0)){
> +    volatile double d = (0x1.fa658c23b1578p-1 - 0x1.d0a118f324b63p-1*z)*z - 0x1.2788cfc6fb619p-1;
> +    double f = 1.0/z + d;
> +    float r = f;
> +    if(__builtin_fabs(r)>0x1.fffffep+127f) errno = ERANGE;
> +    b64u64_u rt = {.f = f};
> +    if(((rt.u+2)&0xfffffff) < 4){
> +      for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++)
> +	if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
>      }
> -  if (__glibc_unlikely (hx == 0xff800000))
> -    {
> -      /* x == -Inf.  According to ISO this is NaN.  */
> -      *signgamp = 0;
> -      return x - x;
> +    return r;
> +  }
> +  float fx = __builtin_floor(x);
> +  int k = fx;
> +  if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
> +    float r = 0x1p127f * 0x1p127f;
> +    if(r>0x1.fffffep+127) errno = ERANGE;
> +    return r;
> +  }
> +  if(__builtin_expect(fx==x, 0)){
> +    if(x == 0.0f){
> +      errno = ERANGE;
> +      return 1.0f/x;
>      }
> -  if (__glibc_unlikely ((hx & 0x7f800000) == 0x7f800000))
> -    {
> -      /* Positive infinity (return positive infinity) or NaN (return
> -	 NaN).  */
> -      *signgamp = 0;
> -      return x + x;
> +    if(x < 0.0f) {
> +      errno = EDOM;
> +      return __builtin_nanf("12");
>      }
> +    double t0 = 1, x0 = 1;
> +    for(int i=1; i<k; i++, x0 += 1.0) t0 *= x0;
> +    return t0;
> +  }
> +  if(__builtin_expect(x<-47.0f, 0)){
> +    static const float sgn[2] = {0x1p-127, -0x1p-127};
> +    float r = 0x1p-127f * sgn[k&1];
> +    if(r == 0.0f) errno = ERANGE;
> +    return r;
> +  }
> +  static const double c[] =
> +    {0x1.c9a76be577123p+0, 0x1.8f2754ddcf90dp+0, 0x1.0d1191949419bp+0, 0x1.e1f42cf0ae4a1p-2,
> +     0x1.82b358a3ab638p-3, 0x1.e1f2b30cd907bp-5, 0x1.240f6d4071bd8p-6, 0x1.1522c9f3cd012p-8,
> +     0x1.1fd0051a0525bp-10, 0x1.9808a8b96c37ep-13, 0x1.b3f78e01152b5p-15, 0x1.49c85a7e1fd04p-18,
> +     0x1.471ca49184475p-19, -0x1.368f0b7ed9e36p-23, 0x1.882222f9049efp-23, -0x1.a69ed2042842cp-25};
>  
> -  if (x >= 36.0f)
> -    {
> -      /* Overflow.  */
> -      *signgamp = 0;
> -      ret = math_narrow_eval (FLT_MAX * FLT_MAX);
> -      return ret;
> -    }
> -  else
> -    {
> -      SET_RESTORE_ROUNDF (FE_TONEAREST);
> -      if (x > 0.0f)
> -	{
> -	  *signgamp = 0;
> -	  int exp2_adj;
> -	  float tret = gammaf_positive (x, &exp2_adj);
> -	  ret = __scalbnf (tret, exp2_adj);
> -	}
> -      else if (x >= -FLT_EPSILON / 4.0f)
> -	{
> -	  *signgamp = 0;
> -	  ret = 1.0f / x;
> -	}
> -      else
> -	{
> -	  float tx = truncf (x);
> -	  *signgamp = (tx == 2.0f * truncf (tx / 2.0f)) ? -1 : 1;
> -	  if (x <= -42.0f)
> -	    /* Underflow.  */
> -	    ret = FLT_MIN * FLT_MIN;
> -	  else
> -	    {
> -	      float frac = tx - x;
> -	      if (frac > 0.5f)
> -		frac = 1.0f - frac;
> -	      float sinpix = (frac <= 0.25f
> -			      ? __sinf (M_PIf * frac)
> -			      : __cosf (M_PIf * (0.5f - frac)));
> -	      int exp2_adj;
> -	      float tret = M_PIf / (-x * sinpix
> -				    * gammaf_positive (-x, &exp2_adj));
> -	      ret = __scalbnf (tret, -exp2_adj);
> -	      math_check_force_underflow_nonneg (ret);
> -	    }
> -	}
> -      ret = math_narrow_eval (ret);
> -    }
> -  if (isinf (ret) && x != 0)
> -    {
> -      if (*signgamp < 0)
> -	{
> -	  ret = math_narrow_eval (-copysignf (FLT_MAX, ret) * FLT_MAX);
> -	  ret = -ret;
> -	}
> -      else
> -	ret = math_narrow_eval (copysignf (FLT_MAX, ret) * FLT_MAX);
> -      return ret;
> -    }
> -  else if (ret == 0)
> -    {
> -      if (*signgamp < 0)
> -	{
> -	  ret = math_narrow_eval (-copysignf (FLT_MIN, ret) * FLT_MIN);
> -	  ret = -ret;
> -	}
> -      else
> -	ret = math_narrow_eval (copysignf (FLT_MIN, ret) * FLT_MIN);
> -      return ret;
> +  double m = z - 0x1.7p+1, i = __builtin_roundeven(m), step = __builtin_copysign(1.0,i);
> +  double d = m - i, d2 = d*d, d4 = d2*d2, d8 = d4*d4;
> +  double f = (c[0] + d*c[1]) + d2*(c[2] + d*c[3]) + d4*((c[4] + d*c[5]) + d2*(c[6] + d*c[7]))
> +    + d8*((c[8] + d*c[9]) + d2*(c[10] + d*c[11]) + d4*((c[12] + d*c[13]) + d2*(c[14] + d*c[15])));
> +  int jm = __builtin_fabs(i);
> +  double w = 1;
> +  if(jm){
> +    z -= 0.5 + step*0.5;
> +    w = z;
> +    for(int j=jm-1; j; j--) {z -= step; w *= z;}
> +  }
> +  if(i<=-0.5) w = 1/w;
> +  f *= w;
> +  b64u64_u rt = {.f = f};
> +  float r = f;
> +  if(__builtin_expect(r==0.0f, 0)) errno = ERANGE;
> +  if(__builtin_expect(((rt.u+2)&0xfffffff) < 8, 0)){
> +    for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++) {
> +      if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
>      }
> -  else
> -    return ret;
> +  }
> +  return r;
>  }
>  libm_alias_finite (__ieee754_gammaf_r, __gammaf_r)
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 37d8998c71..29e3ca926a 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -2263,25 +2263,21 @@ double: 1
>  
>  Function: "tgamma":
>  double: 9
> -float: 8
>  float128: 4
>  ldouble: 5
>  
>  Function: "tgamma_downward":
>  double: 9
> -float: 7
>  float128: 5
>  ldouble: 6
>  
>  Function: "tgamma_towardzero":
>  double: 9
> -float: 7
>  float128: 5
>  ldouble: 6
>  
>  Function: "tgamma_upward":
>  double: 9
> -float: 8
>  float128: 4
>  ldouble: 5
>
Paul Zimmermann July 29, 2024, 3:04 p.m. UTC | #3
Hi Adhemerval,

thank you for your review.

> Thanks for working on this, however it seems there is real regressions on
> x86 32 bit bot [1].

looking at this:

Failure: Test: tgamma_towardzero (0x2.30a44p+4)
Result:
 is:          inf   inf
 should be:   3.40282346e+38   0x1.fffffep+127

I don't understand how this can be possible. The code has:

 if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
    float r = 0x1p127f * 0x1p127f;
    if(r>0x1.fffffep+127) errno = ERANGE;
    return r;
  }

since 0x2.30a44p+4 = 0x1.18522p+5, r should be rounded towards zero to FLT_MAX
and not inf. I'll change to math_narrow_eval (FLT_MAX * FLT_MAX) as in the original
code.
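
To illustrate the expected behaviour, a minimal standalone sketch (assuming SSE
arithmetic, i.e. no x87 excess precision, and a compiler that honours the dynamic
rounding mode for this product):

  #include <fenv.h>
  #include <float.h>
  #include <stdio.h>

  int
  main (void)
  {
    fesetround (FE_TOWARDZERO);
    volatile float big = FLT_MAX;       /* volatile defeats constant folding */
    float r = big * big;                /* overflows; rounds toward zero to FLT_MAX */
    printf ("%a\n", (double) r);        /* expected: 0x1.fffffep+127, not inf */
    return 0;
  }

On 32-bit x86 with x87 excess precision the product is not necessarily narrowed
to float at this point, which is the sort of issue discussed further below.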

> Also, for this license we will need a Sign-off by you (or by the original
> author) and I think this license should be compatible to be accepted.
> Carlos said he will check this out as well.

of course.

> [1] https://www.delorie.com/trybots/32bit/36770/math-test-float-tgamma.out

Best regards,
Paul
Paul Zimmermann July 29, 2024, 3:23 p.m. UTC | #4
Dear Adhemerval,

looking at the following failures:

Failure: tgamma (-0x1.86a08p+16): errno set to 0, expected 34 (ERANGE)
Failure: tgamma (-0x1.f3fffep+8): errno set to 0, expected 34 (ERANGE)

I don't understand them either. The code has for negative non-integer numbers:

  if(__builtin_expect(x<-47.0f, 0)){
    static const float sgn[2] = {0x1p-127f, -0x1p-127f};
    float r = 0x1p-127f * sgn[k&1];
    if(r == 0.0f) errno = ERANGE;
    return r;
  }

then for rounding to nearest r should always be 0, thus errno should be set to ERANGE.

Paul
Florian Weimer July 29, 2024, 3:44 p.m. UTC | #5
* Paul Zimmermann:

>        Dear Adhemerval,
>
> looking at the following failures:
>
> Failure: tgamma (-0x1.86a08p+16): errno set to 0, expected 34 (ERANGE)
> Failure: tgamma (-0x1.f3fffep+8): errno set to 0, expected 34 (ERANGE)
>
> I don't understand them either. The code has for negative non-integer numbers:
>
>   if(__builtin_expect(x<-47.0f, 0)){
>     static const float sgn[2] = {0x1p-127f, -0x1p-127f};
>     float r = 0x1p-127f * sgn[k&1];
>     if(r == 0.0f) errno = ERANGE;
>     return r;
>   }
>
> then for rounding to nearest r should always be 0, thus errno should be set to ERANGE.

Maybe an i387 excess precision issue?

Thanks,
Florian
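
For illustration, a hedged standalone sketch of the suspected effect (assuming a
32-bit x86 build that uses the x87 unit, i.e. FLT_EVAL_METHOD != 0, and GCC's
traditional excess-precision behaviour):

  #include <stdio.h>

  int
  main (void)
  {
    float a = 0x1p-127f;
    float r = a * a;          /* strict binary32: 0x1p-254 underflows to +0 */
    /* On i387 without -ffloat-store or -fexcess-precision=standard, r may be
       kept in an 80-bit register still holding 0x1p-254, so this test fails
       and errno is never set.  */
    if (r == 0.0f)
      puts ("underflowed to zero: errno = ERANGE would be set");
    else
      puts ("nonzero in excess precision: errno not set");

    /* Forcing a narrowing store (what glibc's math_narrow_eval is for)
       avoids the problem.  */
    volatile float rn = a * a;
    printf ("after forced store: %a\n", (double) rn);
    return 0;
  }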
Paul Zimmermann July 29, 2024, 3:55 p.m. UTC | #6
thank you Florian. Now I believe one should always have errno = ERANGE in this case,
as indicated by the C standard:

Likewise, a range error occurs if and only if the result overflows or underflows...

The result underflows if a nonzero result value with ordinary accuracy would have magnitude
(absolute value) less than the minimum normalized number in the type...

Paul

> From: Florian Weimer <fweimer@redhat.com>
> Cc: Adhemerval Zanella Netto <adhemerval.zanella@linaro.org>,
>   libc-alpha@sourceware.org,  carlos@redhat.com,  sibid@uvic.ca
> Date: Mon, 29 Jul 2024 17:44:18 +0200
> 
> * Paul Zimmermann:
> 
> >        Dear Adhemerval,
> >
> > looking at the following failures:
> >
> > Failure: tgamma (-0x1.86a08p+16): errno set to 0, expected 34 (ERANGE)
> > Failure: tgamma (-0x1.f3fffep+8): errno set to 0, expected 34 (ERANGE)
> >
> > I don't understand them either. The code has for negative non-integer numbers:
> >
> >   if(__builtin_expect(x<-47.0f, 0)){
> >     static const float sgn[2] = {0x1p-127f, -0x1p-127f};
> >     float r = 0x1p-127f * sgn[k&1];
> >     if(r == 0.0f) errno = ERANGE;
> >     return r;
> >   }
> >
> > then for rounding to nearest r should always be 0, thus errno should be set to ERANGE.
> 
> Maybe an i387 excess precision issue?
> 
> Thanks,
> Florian
> 
>
Joseph Myers July 29, 2024, 10:30 p.m. UTC | #7
On Mon, 29 Jul 2024, Paul Zimmermann wrote:

> The CORE-MATH implementation is correctly rounded (for any rounding mode).
> This can be checked by exhaustive tests in a few minutes since there are
> less than 2^32 values to check (against GNU MPFR for example).

What platforms have you tested on?  (That is, tested *with the glibc 
testsuite*, which covers correct exceptions and errno setting in 
accordance with glibc expectations.)  Those should include at least one 
platform without excess precision (e.g. x86_64) and one with excess 
precision (32-bit x86).

> diff --git a/math/w_tgammaf_compat.c b/math/w_tgammaf_compat.c
> index 34e0e096e0..b5208c54d5 100644
> --- a/math/w_tgammaf_compat.c
> +++ b/math/w_tgammaf_compat.c

These compat wrapper changes are suspect.  The ABI for the compat wrappers 
is that they support SVID error handling (calling __kernel_standard* on 
error in order to call a user-provided matherr function).  (This is only 
relevant for old binaries, not new ones, as the symbols required for SVID 
error handling are compat symbols so you can't link new binaries to use it 
without symbol versioning tricks that shouldn't be used outside the glibc 
testsuite.)

We don't systematically test SVID error handling for all functions in the 
glibc testsuite, only a few (see test-matherr.c and test-matherr-2.c).  
Maybe we *should* add tests of SVID error handling for all functions (and 
all the supported error cases for each function) to avoid patches 
mistakenly changing the compat wrappers in ways that break compatibility - 
if we'd had a test of SVID error handling for tgammaf, I expect it would 
have detected breakage from this change to a compat wrapper.

> +  b32u32_u t = {.f = x};
> +  uint32_t ax = t.u<<1;
> +  if(__builtin_expect(ax>=(0xffu<<24), 0)){
> +    if(ax==(0xffu<<24)){
> +      if(t.u>>31){
> +	errno = EDOM;
> +	return __builtin_nanf("12");
> +      }

I think this is about the -Inf case?

(a) Please include comments explaining what cases are being handled 
whenever the conditions are determined by bit manipulation rather than 
straight comparisons of floating-point values.  The old code has comments 
such as

/* Return value for integer x < 0 is NaN with invalid exception.  */

and

/* x == -Inf.  According to ISO this is NaN.  */

and

/* Positive infinity (return positive infinity) or NaN (return NaN).  */

and it's not appropriate to regress this by introducing less-documented 
code.  More generally, lots more comments throughout the code, explaining 
what it's doing (particular cases being handled, overall evaluation 
strategy for the function in those cases), would be helpful.  Not comments 
repeating the obvious meaning of an individual line of C code, but 
comments of the level of "handle special case X" or "evaluate tgammaf on 
interval [A, B] using a degree-C polynomial approximation" or "handle hard 
cases for correct rounding, listed in table X" or similar, so that the 
reader can quickly understand the overall intent of each part of the code.

(b) errno is handled by the wrapper (w_tgamma_template.c); you shouldn't 
need to handle it here.

(c) On the other hand, the wrapper assumes that the individual function 
implementation raised appropriate exceptions.  How does this code ensure 
that "invalid" is raised?  Have you tested with the glibc testsuite, which 
verifies the "invalid" exception for negative infinity?  I'd expect 
something like "return x - x;" to ensure an appropriate exception is 
raised.

    TEST_f_f (tgamma, 0, plus_infty, DIVIDE_BY_ZERO_EXCEPTION|ERRNO_ERANGE),

> +    return x; // nan

And this is for negative NaN.  But you need to ensure that a signaling NaN 
is converted to quiet with "invalid" raised, so something more like 
"return x - x;" would be appropriate here as well.  The glibc testsuite 
includes tests for this as well.
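
As a hedged illustration of the "x - x" idiom (a standalone sketch, not the patch
itself): subtracting -Inf from itself yields a quiet NaN and raises the invalid
exception, and the same expression also quiets a signaling NaN operand while
raising invalid.

  #include <fenv.h>
  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    feclearexcept (FE_ALL_EXCEPT);
    volatile float x = -INFINITY;
    volatile float r = x - x;     /* (-Inf) - (-Inf) = qNaN, raises FE_INVALID */
    printf ("r=%a invalid=%d\n", (double) r,
            fetestexcept (FE_INVALID) != 0);
    return 0;
  }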

> +  }
> +  double z = x;
> +  if(__builtin_expect(ax<0x6d000000u, 0)){

Again, please add comments explaining any comparisons based on bit 
patterns.

> +    volatile double d = (0x1.fa658c23b1578p-1 - 0x1.d0a118f324b63p-1*z)*z - 0x1.2788cfc6fb619p-1;
> +    double f = 1.0/z + d;
> +    float r = f;
> +    if(__builtin_fabs(r)>0x1.fffffep+127f) errno = ERANGE;

Any time you call a double function on a float value is suspect 
(__builtin_fabs here), because doing so is usually a bug.  If it's 
deliberate, it should have a comment.  If you're meaning to deal with 
excess precision in some way, use of math_narrow_eval is normal in glibc 
code.

> +  float fx = __builtin_floor(x);

Likewise, if you need to use the double function rather than the float one 
then please explain why in a comment.  And normally we'd use floorf etc. 
directly rather than __builtin_*.

> +  int k = fx;
> +  if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
> +    float r = 0x1p127f * 0x1p127f;
> +    if(r>0x1.fffffep+127) errno = ERANGE;
> +    return r;

Definitely a case for math_narrow_eval.  As everywhere, there should be no 
need to set errno because the wrappers handle that.

> +  }
> +  if(__builtin_expect(fx==x, 0)){

Throughout, in glibc code it's better to use GNU coding style, and glibc 
macros such as __glibc_unlikely rather than raw __builtin_expect, unless 
you expect this code to be copied verbatim from upstream in future with no 
local changes at all.

> +    if(x < 0.0f) {
> +      errno = EDOM;
> +      return __builtin_nanf("12");

As elsewhere, you need to compute a NaN in a way that raises an exception.

> +  double m = z - 0x1.7p+1, i = __builtin_roundeven(m), step = __builtin_copysign(1.0,i);

Normally the relevant functions would be used directly without __builtin.

When benchmarking, try to include platforms where roundeven is not inlined 
and so the out-of-line roundeven function in glibc is called here - 
especially where that means the C implementation rather than a 
processor-specific IFUNC / assembly implementation - as that may 
significantly affect performance.  And state explicitly what platform you 
benchmarked on (both how the compiler building glibc is configured, and 
what processor is in use at runtime as that may affect IFUNCs used).

It's likely this is substantially faster than the existing code even with 
the use of the C roundeven implementation, but the choice of roundeven 
implementation could be significant enough it's important to consider it 
in benchmarking.

> +  if(__builtin_expect(((rt.u+2)&0xfffffff) < 8, 0)){
> +    for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++) {
> +      if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
>      }

I don't think shadowing variables is very good style (you have a double 
variable i in scope from just above this).

> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> index 37d8998c71..29e3ca926a 100644
> --- a/sysdeps/x86_64/fpu/libm-test-ulps
> +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> @@ -2263,25 +2263,21 @@ double: 1
>  
>  Function: "tgamma":
>  double: 9
> -float: 8
>  float128: 4
>  ldouble: 5

Removing ulps entries like this is something purely mechanical that it seems 
appropriate to apply to *all* libm-test-ulps files, not just one.  (Cf. 
how when adding the logp1 aliases I copied ulps from log1p for all 
libm-test-ulps files.)
Joseph Myers July 29, 2024, 10:38 p.m. UTC | #8
On Mon, 29 Jul 2024, Adhemerval Zanella Netto wrote:

> Thanks for working on this; however, it seems there are real regressions on the
> x86 32-bit bot [1].

I don't see how this could have worked for 64-bit either, given the lack 
of code to ensure required exceptions are raised (and sNaN converted to 
qNaN) in various special cases.

All patch submissions should say how they were tested (typically with the 
glibc testsuite on at least one platform, sometimes with 
build-many-glibcs.py if compilation testing suffices - but in any case, it 
should be clear from the patch submission whether the testsuite was run 
including execution testing, and on what platforms if so).

> > diff --git a/benchtests/strcoll-inputs/filelist#en_US.UTF-8 b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> > index 0d8f1c722b..93142aed97 100644
> > --- a/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> > +++ b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
> > @@ -5315,7 +5315,6 @@ s_isinf.c
> >  dbl2mpn.c
> >  atnat.h
> >  flt-32
> > -e_gammaf_r.c
> >  e_remainderf.c
> >  s_llroundf.c
> >  s_erff.c

This part of the change is also highly suspect.  There is no need to 
change benchmark input for strcoll just because that input was originally 
based on a list of files in glibc and you're changing one such file!  
Indeed such a change should be discouraged.
Joseph Myers July 29, 2024, 10:48 p.m. UTC | #9
On Mon, 29 Jul 2024, Adhemerval Zanella Netto wrote:

> Thanks for working on this; however, it seems there are real regressions on the
> x86 32-bit bot [1].

Also linknamespace failures for arm/aarch64.

It looks like the new file doesn't include <math.h> at all.  By not 
including <math.h>, it thereby misses out on the redirections in 
include/math.h that ensure that roundeven calls end up calling __roundeven 
if not expanded inline.
Paul Zimmermann July 30, 2024, 11:30 a.m. UTC | #10
Dear Joseph,

thank you for your very detailed review. Trying to integrate some CORE-MATH
code into GNU libc helps me to better learn the GNU libc architecture, and
the GNU libc test suite was helpful to improve the CORE-MATH code.
Some answers below.

> What platforms have you tested on?  (That is, tested *with the glibc 
> testsuite*, which covers correct exceptions and errno setting in 
> accordance with glibc expectations.)  Those should include at least one 
> platform without excess precision (e.g. x86_64) and one with excess 
> precision (32-bit x86).

I have tested on x86_64 under Linux, but "make check" did halt prematurely.
After I deleted the build directory, now I can do "make check" normally.

> These compat wrapper changes are suspect.  The ABI for the compat wrappers 
> is that they support SVID error handling (calling __kernel_standard* on 
> error in order to call a user-provided matherr function).  (This is only 
> relevant for old binaries, not new ones, as the symbols required for SVID 
> error handling are compat symbols so you can't link new binaries to use it 
> without symbol versioning tricks that shouldn't be used outside the glibc 
> testsuite.)

I've restored the original compat wrapper, with one exception: the signgam
variable was passed by reference to the __ieee754_gammaf_r() function, which
did put in it the sign of gamma(x). Since the CORE-MATH code directly computes
the correct sign, this is no longer needed. But I have kept the signgam
variable for binary compatibility.

> We don't systematically test SVID error handling for all functions in the 
> glibc testsuite, only a few (see test-matherr.c and test-matherr-2.c).  
> Maybe we *should* add tests of SVID error handling for all functions (and 
> all the supported error cases for each function) to avoid patches 
> mistakenly changing the compat wrappers in ways that break compatibility - 
> if we'd had a test of SVID error handling for tgammaf, I expect it would 
> have detected breakage from this change to a compat wrapper.

yes it would be good to have such tests.

> (a) Please include comments explaining what cases are being handled 
> whenever the conditions are determined by bit manipulation rather than 
> straight comparisons of floating-point values.  The old code has comments 
> such as

I have included many comments.

> (b) errno is handled by the wrapper (w_tgamma_template.c); you shouldn't 
> need to handle it here.

indeed, now the wrapper takes care of errno, thus I have removed the
corresponding instructions.

> (c) On the other hand, the wrapper assumes that the individual function 
> implementation raised appropriate exceptions.  How does this code ensure 
> that "invalid" is raised?  Have you tested with the glibc testsuite, which 
> verifies the "invalid" exception for negative infinity?  I'd expect 
> something like "return x - x;" to ensure an appropriate exception is 
> raised.

this is ok now, all tests do pass.

> > +    if(__builtin_fabs(r)>0x1.fffffep+127f) errno = ERANGE;
> 
> Any call to a double function on a float value is suspect 
> (__builtin_fabs here), because doing so is usually a bug.  If it's 
> deliberate, it should have a comment.  If you're meaning to deal with 
> excess precision in some way, use of math_narrow_eval is normal in glibc 
> code.

it should have been __builtin_fabsf indeed. This has gone thanks to the
compat wrapper. But I have ported the change back upstream.

> > +  float fx = __builtin_floor(x);
> 
> Likewise, if you need to use the double function rather than the float one 
> then please explain why in a comment.  And normally we'd use floorf etc. 
> directly rather than __builtin_*.

again, it should be __builtin_floorf

> > +  int k = fx;
> > +  if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
> > +    float r = 0x1p127f * 0x1p127f;
> > +    if(r>0x1.fffffep+127) errno = ERANGE;
> > +    return r;
> 
> Definitely a case for math_narrow_eval.  As everywhere, there should be no 
> need to set errno because the wrappers handle that.

I'm not used to math_narrow_eval, and I prefer not to diverge too much from
upstream, to make maintenance easier in the future.

> > +  }
> > +  if(__builtin_expect(fx==x, 0)){
> 
> Throughout, in glibc code it's better to use GNU coding style, and glibc 
> macros such as __glibc_unlikely rather than raw __builtin_expect, unless 
> you expect this code to be copied verbatim from upstream in future with no 
> local changes at all.

indeed I prefer to stay close to the upstream code

> > +    if(x < 0.0f) {
> > +      errno = EDOM;
> > +      return __builtin_nanf("12");
> 
> As elsewhere, you need to compute a NaN in a way that raises an exception.
> 
> > +  double m = z - 0x1.7p+1, i = __builtin_roundeven(m), step = __builtin_copysign(1.0,i);
> 
> Normally the relevant functions would be used directly without __builtin.

idem

> When benchmarking, try to include platforms where roundeven is not inlined 
> and so the out-of-line roundeven function in glibc is called here - 
> especially where that means the C implementation rather than a 
> processor-specific IFUNC / assembly implementation - as that may 
> significantly affect performance.  And state explicitly what platform you 
> benchmarked on (both how the compiler building glibc is configured, and 
> what processor is in use at runtime as that may affect IFUNCs used).

do you know any platform on the GCC compile farm where roundeven is not
inlined?

> It's likely this is substantially faster than the existing code even with 
> the use of the C roundeven implementation, but the choice of roundeven 
> implementation could be significant enough it's important to consider it 
> in benchmarking.
> 
> > +  if(__builtin_expect(((rt.u+2)&0xfffffff) < 8, 0)){
> > +    for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++) {
> > +      if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
> >      }
> 
> I don't think shadowing variables is very good style (you have a double 
> variable i in scope from just above this).

thank you, fixed (and also upstream)

> > diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
> > index 37d8998c71..29e3ca926a 100644
> > --- a/sysdeps/x86_64/fpu/libm-test-ulps
> > +++ b/sysdeps/x86_64/fpu/libm-test-ulps
> > @@ -2263,25 +2263,21 @@ double: 1
> >  
> >  Function: "tgamma":
> >  double: 9
> > -float: 8
> >  float128: 4
> >  ldouble: 5
> 
> Removing ulps entries like this is something purely mechanical that it seems 
> appropriate to apply to *all* libm-test-ulps files, not just one.  (Cf. 
> how when adding the logp1 aliases I copied ulps from log1p for all 
> libm-test-ulps files.)

I have removed tgamma/float entries in all libm-test-ulps files

A v2 will follow.

Paul
Vincent Lefevre July 30, 2024, 12:10 p.m. UTC | #11
On 2024-07-30 13:30:39 +0200, Paul Zimmermann wrote:
> > > +  int k = fx;
> > > +  if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
> > > +    float r = 0x1p127f * 0x1p127f;
> > > +    if(r>0x1.fffffep+127) errno = ERANGE;
> > > +    return r;
> > 
> > Definitely a case for math_narrow_eval.  As everywhere, there should be no 
> > need to set errno because the wrappers handle that.
> 
> I'm not used to math_narrow_eval, and I prefer not to diverge too much from
> upstream, to make maintenance easier in the future.

But if I understand correctly, this means that on 32-bit x86
(without SSE2), the result will be evaluated and kept as a
double-precision value. This is not what you want.
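
A hedged sketch of the kind of narrowing being discussed (glibc's actual macro
lives in math-narrow-eval.h; this is only an equivalent illustration using a
volatile store, with a hypothetical helper named narrow_float):

  #include <float.h>
  #include <stdio.h>

  /* Force a value through a float object so any excess precision is dropped,
     in the spirit of math_narrow_eval.  */
  static inline float
  narrow_float (float x)
  {
    volatile float t = x;
    return t;
  }

  int
  main (void)
  {
    float r = narrow_float (0x1p127f * 0x1p127f); /* +inf in round-to-nearest */
    if (r > FLT_MAX)
      puts ("overflow: the r > 0x1.fffffep+127 check fires");
    printf ("%a\n", (double) r);
    return 0;
  }

With the product forced back to binary32, the comparison and the returned value
match what strict float evaluation would give, even on 32-bit x86 without SSE2.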
Andreas Schwab July 30, 2024, 1:30 p.m. UTC | #12
On Jul 30 2024, Paul Zimmermann wrote:

> I've restored the original compat wrapper, with one exception: the signgam
> variable was passed by reference to the __ieee754_gammaf_r() function, which
> did put in it the sign of gamma(x). Since the CORE-MATH code directly computes
> the correct sign, this is no longer needed. But I have kept the signgam
> variable for binary compatibility.

Doesn't that break binary compatibility of __gammaf_r_finite?
Joseph Myers July 30, 2024, 1:49 p.m. UTC | #13
On Tue, 30 Jul 2024, Andreas Schwab wrote:

> On Jul 30 2024, Paul Zimmermann wrote:
> 
> > I've restored the original compat wrapper, with one exception: the signgam
> > variable was passed by reference to the __ieee754_gammaf_r() function, which
> > did put in it the sign of gamma(x). Since the CORE-MATH code directly computes
> > the correct sign, this is no longer needed. But I have kept the signgam
> > variable for binary compatibility.
> 
> Doesn't that break binary compatibility of __gammaf_r_finite?

I think the binary compatibility requirement for __gammaf_r_finite is for 
users who previously got a call from <bits/math-finite.h>.  The tgammaf 
implementation in <bits/math-finite.h>, before it was deleted, set its 
local variable to 0 before calling __gammaf_r_finite.  So I think it's OK 
for __gammaf_r_finite to return a result with the correct sign and ignore 
the pointer passed in.  (Indeed, in the case of small negative x the 
existing implementation returns a negative result with *signgamp set to 0, 
rather than returning a positive result for the caller to negate.)
Paul Zimmermann July 30, 2024, 2:01 p.m. UTC | #14
Hi Andreas,

> > I've restored the original compat wrapper, with one exception: the signgam
> > variable was passed by reference to the __ieee754_gammaf_r() function, which
> > did put in it the sign of gamma(x). Since the CORE-MATH code directly computes
> > the correct sign, this is no longer needed. But I have kept the signgam
> > variable for binary compatibility.
> 
> Doesn't that break binary compatibility of __gammaf_r_finite?

I don't think so, since __ieee754_gammaf_r() is only used from
math/w_tgammaf_compat.c.

Paul
Joseph Myers July 30, 2024, 2:02 p.m. UTC | #15
On Tue, 30 Jul 2024, Paul Zimmermann wrote:

> > When benchmarking, try to include platforms where roundeven is not inlined 
> > and so the out-of-line roundeven function in glibc is called here - 
> > especially where that means the C implementation rather than a 
> > processor-specific IFUNC / assembly implementation - as that may 
> > significantly affect performance.  And state explicitly what platform you 
> > benchmarked on (both how the compiler building glibc is configured, and 
> > what processor is in use at runtime as that may affect IFUNCs used).
> 
> do you know any platform on the GCC compile farm where roundeven is not
> inlined?

Most platforms do not inline roundeven.  GCC only supports inlining it at 
all for aarch64/arm/x86/loongarch/riscv, so if you have access to 
powerpc64le, for example, that will use the C roundeven implementation, 
out of line.  As I said, I expect the benchmarks still to be positive in 
this case, but it's worth testing.
Andreas Schwab July 30, 2024, 2:13 p.m. UTC | #16
On Jul 30 2024, Paul Zimmermann wrote:

>        Hi Andreas,
>
>> > I've restored the original compat wrapper, with one exception: the signgam
>> > variable was passed by reference to the __ieee754_gammaf_r() function, which
>> > did put in it the sign of gamma(x). Since the CORE-MATH code directly computes
>> > the correct sign, this is no longer needed. But I have kept the signgam
>> > variable for binary compatibility.
>> 
>> Doesn't that break binary compatibility of __gammaf_r_finite?
>
> I don't think so, since __ieee754_gammaf_r() is only used from
> math/w_tgammaf_compat.c.

It also has a libm_alias_finite alias in e_gammaf_r.c.

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 382fb7bae1..265ad34d8d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -94,6 +94,7 @@  bench-math := \
   tan \
   tanh \
   tgamma \
+  tgammaf \
   trunc \
   truncf \
   y0 \
diff --git a/benchtests/strcoll-inputs/filelist#en_US.UTF-8 b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
index 0d8f1c722b..93142aed97 100644
--- a/benchtests/strcoll-inputs/filelist#en_US.UTF-8
+++ b/benchtests/strcoll-inputs/filelist#en_US.UTF-8
@@ -5315,7 +5315,6 @@  s_isinf.c
 dbl2mpn.c
 atnat.h
 flt-32
-e_gammaf_r.c
 e_remainderf.c
 s_llroundf.c
 s_erff.c
diff --git a/math/w_tgammaf_compat.c b/math/w_tgammaf_compat.c
index 34e0e096e0..b5208c54d5 100644
--- a/math/w_tgammaf_compat.c
+++ b/math/w_tgammaf_compat.c
@@ -2,14 +2,28 @@ 
  */
 
 /*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
+Copyright (c) 2023 Alexei Sibidanov.
+
+This file was copied from the CORE-MATH project
+(file src/binary32/tgamma/tgammaf.c, revision 673c2af)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
  */
 
 #include <errno.h>
@@ -22,26 +36,8 @@ 
 float
 __tgammaf(float x)
 {
-	int local_signgam;
-	float y = __ieee754_gammaf_r(x,&local_signgam);
-
-	if(__glibc_unlikely (!isfinite (y) || y == 0)
-	   && (isfinite (x) || (isinf (x) && x < 0.0))
-	   && _LIB_VERSION != _IEEE_) {
-	  if (x == (float)0.0)
-	    /* tgammaf pole */
-	    return __kernel_standard_f(x, x, 150);
-	  else if(floorf(x)==x&&x<0.0f)
-	    /* tgammaf domain */
-	    return __kernel_standard_f(x, x, 141);
-	  else if (y == 0)
-	    /* tgammaf underflow */
-	    __set_errno (ERANGE);
-	  else
-	    /* tgammaf overflow */
-	    return __kernel_standard_f(x, x, 140);
-	}
-	return local_signgam < 0 ? - y : y;
+  int e;
+  return __ieee754_gammaf_r (x, &e);
 }
 libm_alias_float (__tgamma, tgamma)
 #endif
diff --git a/sysdeps/ieee754/flt-32/e_gammaf_r.c b/sysdeps/ieee754/flt-32/e_gammaf_r.c
index a9730d61c1..9df66c7757 100644
--- a/sysdeps/ieee754/flt-32/e_gammaf_r.c
+++ b/sysdeps/ieee754/flt-32/e_gammaf_r.c
@@ -1,215 +1,130 @@ 
-/* Implementation of gamma function according to ISO C.
-   Copyright (C) 1997-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
+/* Implementation of the gamma function for binary32.
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+Copyright (c) 2023-2024 Alexei Sibidanov.
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+This file was copied from the CORE-MATH project
+(file src/binary32/tgamma/tgammaf.c, revision a48e352)
 
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-#include <math.h>
-#include <math-narrow-eval.h>
-#include <math_private.h>
-#include <fenv_private.h>
-#include <math-underflow.h>
-#include <float.h>
-#include <libm-alias-finite.h>
-
-/* Coefficients B_2k / 2k(2k-1) of x^-(2k-1) inside exp in Stirling's
-   approximation to gamma function.  */
-
-static const float gamma_coeff[] =
-  {
-    0x1.555556p-4f,
-    -0xb.60b61p-12f,
-    0x3.403404p-12f,
-  };
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
-#define NCOEFF (sizeof (gamma_coeff) / sizeof (gamma_coeff[0]))
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+ */
 
-/* Return gamma (X), for positive X less than 42, in the form R *
-   2^(*EXP2_ADJ), where R is the return value and *EXP2_ADJ is set to
-   avoid overflow or underflow in intermediate calculations.  */
+#include <stdint.h>
+#include <errno.h>
+#include <libm-alias-finite.h>
 
-static float
-gammaf_positive (float x, int *exp2_adj)
-{
-  int local_signgam;
-  if (x < 0.5f)
-    {
-      *exp2_adj = 0;
-      return __ieee754_expf (__ieee754_lgammaf_r (x + 1, &local_signgam)) / x;
-    }
-  else if (x <= 1.5f)
-    {
-      *exp2_adj = 0;
-      return __ieee754_expf (__ieee754_lgammaf_r (x, &local_signgam));
-    }
-  else if (x < 2.5f)
-    {
-      *exp2_adj = 0;
-      float x_adj = x - 1;
-      return (__ieee754_expf (__ieee754_lgammaf_r (x_adj, &local_signgam))
-	      * x_adj);
-    }
-  else
-    {
-      float eps = 0;
-      float x_eps = 0;
-      float x_adj = x;
-      float prod = 1;
-      if (x < 4.0f)
-	{
-	  /* Adjust into the range for applying Stirling's
-	     approximation.  */
-	  float n = ceilf (4.0f - x);
-	  x_adj = math_narrow_eval (x + n);
-	  x_eps = (x - (x_adj - n));
-	  prod = __gamma_productf (x_adj - n, x_eps, n, &eps);
-	}
-      /* The result is now gamma (X_ADJ + X_EPS) / (PROD * (1 + EPS)).
-	 Compute gamma (X_ADJ + X_EPS) using Stirling's approximation,
-	 starting by computing pow (X_ADJ, X_ADJ) with a power of 2
-	 factored out.  */
-      float exp_adj = -eps;
-      float x_adj_int = roundf (x_adj);
-      float x_adj_frac = x_adj - x_adj_int;
-      int x_adj_log2;
-      float x_adj_mant = __frexpf (x_adj, &x_adj_log2);
-      if (x_adj_mant < M_SQRT1_2f)
-	{
-	  x_adj_log2--;
-	  x_adj_mant *= 2.0f;
-	}
-      *exp2_adj = x_adj_log2 * (int) x_adj_int;
-      float ret = (__ieee754_powf (x_adj_mant, x_adj)
-		   * __ieee754_exp2f (x_adj_log2 * x_adj_frac)
-		   * __ieee754_expf (-x_adj)
-		   * sqrtf (2 * M_PIf / x_adj)
-		   / prod);
-      exp_adj += x_eps * __ieee754_logf (x_adj);
-      float bsum = gamma_coeff[NCOEFF - 1];
-      float x_adj2 = x_adj * x_adj;
-      for (size_t i = 1; i <= NCOEFF - 1; i++)
-	bsum = bsum / x_adj2 + gamma_coeff[NCOEFF - 1 - i];
-      exp_adj += bsum / x_adj;
-      return ret + ret * __expm1f (exp_adj);
-    }
-}
+typedef union {float f; uint32_t u;} b32u32_u;
+typedef union {double f; uint64_t u;} b64u64_u;
 
 float
-__ieee754_gammaf_r (float x, int *signgamp)
+__ieee754_gammaf_r (float x, int *exp2_adj)
 {
-  int32_t hx;
-  float ret;
-
-  GET_FLOAT_WORD (hx, x);
+  static const struct {b32u32_u x; float f, df;} tb[] = {
+    {{.u = 0x27de86a9u}, 0x1.268266p+47f, 0x1p22f},
+    {{.u = 0x27e05475u}, 0x1.242422p+47f, 0x1p22f},
+    {{.u = 0xb63befb3u}, -0x1.5cb6e4p+18f, 0x1p-7f},
+    {{.u = 0x3c7bb570u}, 0x1.021d9p+6f, 0x1p-19f},
+    {{.u = 0x41e886d1u}, 0x1.33136ap+98f, 0x1p73f},
+    {{.u = 0xc067d177u}, 0x1.f6850cp-3f, 0x1p-28f},
+    {{.f = -0x1.33b462p-4}, -0x1.befe66p+3, -0x1p-22f},
+    {{.f = -0x1.a988b4p-1}, -0x1.a6b4ecp+2, +0x1p-23f},
+    {{.f = 0x1.dceffcp+4}, 0x1.d3631cp+101, -0x1p-76f},
+    {{.f = 0x1.0874c8p+0}, 0x1.f6c638p-1, 0x1p-26f},
+  };
 
-  if (__glibc_unlikely ((hx & 0x7fffffff) == 0))
-    {
-      /* Return value for x == 0 is Inf with divide by zero exception.  */
-      *signgamp = 0;
-      return 1.0 / x;
+  b32u32_u t = {.f = x};
+  uint32_t ax = t.u<<1;
+  if(__builtin_expect(ax>=(0xffu<<24), 0)){
+    if(ax==(0xffu<<24)){
+      if(t.u>>31){
+	errno = EDOM;
+	return __builtin_nanf("12");
+      }
+      return x;
     }
-  if (__builtin_expect (hx < 0, 0)
-      && (uint32_t) hx < 0xff800000 && rintf (x) == x)
-    {
-      /* Return value for integer x < 0 is NaN with invalid exception.  */
-      *signgamp = 0;
-      return (x - x) / (x - x);
+    return x; // nan
+  }
+  double z = x;
+  if(__builtin_expect(ax<0x6d000000u, 0)){
+    volatile double d = (0x1.fa658c23b1578p-1 - 0x1.d0a118f324b63p-1*z)*z - 0x1.2788cfc6fb619p-1;
+    double f = 1.0/z + d;
+    float r = f;
+    if(__builtin_fabs(r)>0x1.fffffep+127f) errno = ERANGE;
+    b64u64_u rt = {.f = f};
+    if(((rt.u+2)&0xfffffff) < 4){
+      for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++)
+	if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
     }
-  if (__glibc_unlikely (hx == 0xff800000))
-    {
-      /* x == -Inf.  According to ISO this is NaN.  */
-      *signgamp = 0;
-      return x - x;
+    return r;
+  }
+  float fx = __builtin_floor(x);
+  int k = fx;
+  if(__builtin_expect(x >= 0x1.18522p+5f, 0)){
+    float r = 0x1p127f * 0x1p127f;
+    if(r>0x1.fffffep+127) errno = ERANGE;
+    return r;
+  }
+  if(__builtin_expect(fx==x, 0)){
+    if(x == 0.0f){
+      errno = ERANGE;
+      return 1.0f/x;
     }
-  if (__glibc_unlikely ((hx & 0x7f800000) == 0x7f800000))
-    {
-      /* Positive infinity (return positive infinity) or NaN (return
-	 NaN).  */
-      *signgamp = 0;
-      return x + x;
+    if(x < 0.0f) {
+      errno = EDOM;
+      return __builtin_nanf("12");
     }
+    double t0 = 1, x0 = 1;
+    for(int i=1; i<k; i++, x0 += 1.0) t0 *= x0;
+    return t0;
+  }
+  if(__builtin_expect(x<-47.0f, 0)){
+    static const float sgn[2] = {0x1p-127, -0x1p-127};
+    float r = 0x1p-127f * sgn[k&1];
+    if(r == 0.0f) errno = ERANGE;
+    return r;
+  }
+  static const double c[] =
+    {0x1.c9a76be577123p+0, 0x1.8f2754ddcf90dp+0, 0x1.0d1191949419bp+0, 0x1.e1f42cf0ae4a1p-2,
+     0x1.82b358a3ab638p-3, 0x1.e1f2b30cd907bp-5, 0x1.240f6d4071bd8p-6, 0x1.1522c9f3cd012p-8,
+     0x1.1fd0051a0525bp-10, 0x1.9808a8b96c37ep-13, 0x1.b3f78e01152b5p-15, 0x1.49c85a7e1fd04p-18,
+     0x1.471ca49184475p-19, -0x1.368f0b7ed9e36p-23, 0x1.882222f9049efp-23, -0x1.a69ed2042842cp-25};
 
-  if (x >= 36.0f)
-    {
-      /* Overflow.  */
-      *signgamp = 0;
-      ret = math_narrow_eval (FLT_MAX * FLT_MAX);
-      return ret;
-    }
-  else
-    {
-      SET_RESTORE_ROUNDF (FE_TONEAREST);
-      if (x > 0.0f)
-	{
-	  *signgamp = 0;
-	  int exp2_adj;
-	  float tret = gammaf_positive (x, &exp2_adj);
-	  ret = __scalbnf (tret, exp2_adj);
-	}
-      else if (x >= -FLT_EPSILON / 4.0f)
-	{
-	  *signgamp = 0;
-	  ret = 1.0f / x;
-	}
-      else
-	{
-	  float tx = truncf (x);
-	  *signgamp = (tx == 2.0f * truncf (tx / 2.0f)) ? -1 : 1;
-	  if (x <= -42.0f)
-	    /* Underflow.  */
-	    ret = FLT_MIN * FLT_MIN;
-	  else
-	    {
-	      float frac = tx - x;
-	      if (frac > 0.5f)
-		frac = 1.0f - frac;
-	      float sinpix = (frac <= 0.25f
-			      ? __sinf (M_PIf * frac)
-			      : __cosf (M_PIf * (0.5f - frac)));
-	      int exp2_adj;
-	      float tret = M_PIf / (-x * sinpix
-				    * gammaf_positive (-x, &exp2_adj));
-	      ret = __scalbnf (tret, -exp2_adj);
-	      math_check_force_underflow_nonneg (ret);
-	    }
-	}
-      ret = math_narrow_eval (ret);
-    }
-  if (isinf (ret) && x != 0)
-    {
-      if (*signgamp < 0)
-	{
-	  ret = math_narrow_eval (-copysignf (FLT_MAX, ret) * FLT_MAX);
-	  ret = -ret;
-	}
-      else
-	ret = math_narrow_eval (copysignf (FLT_MAX, ret) * FLT_MAX);
-      return ret;
-    }
-  else if (ret == 0)
-    {
-      if (*signgamp < 0)
-	{
-	  ret = math_narrow_eval (-copysignf (FLT_MIN, ret) * FLT_MIN);
-	  ret = -ret;
-	}
-      else
-	ret = math_narrow_eval (copysignf (FLT_MIN, ret) * FLT_MIN);
-      return ret;
+  double m = z - 0x1.7p+1, i = __builtin_roundeven(m), step = __builtin_copysign(1.0,i);
+  double d = m - i, d2 = d*d, d4 = d2*d2, d8 = d4*d4;
+  double f = (c[0] + d*c[1]) + d2*(c[2] + d*c[3]) + d4*((c[4] + d*c[5]) + d2*(c[6] + d*c[7]))
+    + d8*((c[8] + d*c[9]) + d2*(c[10] + d*c[11]) + d4*((c[12] + d*c[13]) + d2*(c[14] + d*c[15])));
+  int jm = __builtin_fabs(i);
+  double w = 1;
+  if(jm){
+    z -= 0.5 + step*0.5;
+    w = z;
+    for(int j=jm-1; j; j--) {z -= step; w *= z;}
+  }
+  if(i<=-0.5) w = 1/w;
+  f *= w;
+  b64u64_u rt = {.f = f};
+  float r = f;
+  if(__builtin_expect(r==0.0f, 0)) errno = ERANGE;
+  if(__builtin_expect(((rt.u+2)&0xfffffff) < 8, 0)){
+    for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++) {
+      if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
     }
-  else
-    return ret;
+  }
+  return r;
 }
 libm_alias_finite (__ieee754_gammaf_r, __gammaf_r)
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 37d8998c71..29e3ca926a 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -2263,25 +2263,21 @@  double: 1
 
 Function: "tgamma":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 float128: 4
 ldouble: 5