diff mbox series

[v2] replace tgammaf by the CORE-MATH implementation

Message ID 20240730114241.578503-1-Paul.Zimmermann@inria.fr
State New
Headers show
Series [v2] replace tgammaf by the CORE-MATH implementation | expand

Commit Message

Paul Zimmermann July 30, 2024, 11:42 a.m. UTC
The CORE-MATH implementation is correctly rounded (for any rounding mode).
This can be checked by exhaustive tests in a few minutes since there are
less than 2^32 values to check against for example GNU MPFR.

Tested on x86_64 under Linux.

This patch also adds some bench values for tgammaf. With the initial
GNU libc code it gave on an Intel(R) Core(TM) i7-8700:

  "tgammaf": {
   "": {
    "duration": 3.50188e+09,
    "iterations": 2e+07,
    "max": 602.891,
    "min": 65.1415,
    "mean": 175.094
   }
  }

With the new code:

  "tgammaf": {
   "": {
    "duration": 3.30825e+09,
    "iterations": 5e+07,
    "max": 211.592,
    "min": 32.0325,
    "mean": 66.1649
   }
  }

Changes in v2:
- include <math.h> (fix the linknamespace failures)
- restored original benchtests/strcoll-inputs/filelist#en_US.UTF-8 file
- restored original wrapper code (math/w_tgammaf_compat.c),
  except for the dealing with the sign
- removed the tgammaf/float entries in all libm-test-ulps files
- address other comments from Joseph Myers
  (https://sourceware.org/pipermail/libc-alpha/2024-July/158736.html)
---
 benchtests/Makefile                           |   1 +
 math/w_tgammaf_compat.c                       |   3 +-
 sysdeps/aarch64/libm-test-ulps                |   4 -
 sysdeps/alpha/fpu/libm-test-ulps              |   4 -
 sysdeps/arc/fpu/libm-test-ulps                |   4 -
 sysdeps/arc/nofpu/libm-test-ulps              |   1 -
 sysdeps/arm/libm-test-ulps                    |   4 -
 sysdeps/csky/fpu/libm-test-ulps               |   4 -
 sysdeps/csky/nofpu/libm-test-ulps             |   4 -
 sysdeps/hppa/fpu/libm-test-ulps               |   4 -
 sysdeps/i386/fpu/libm-test-ulps               |   4 -
 .../i386/i686/fpu/multiarch/libm-test-ulps    |   4 -
 sysdeps/ieee754/flt-32/e_gammaf_r.c           | 310 +++++++-----------
 sysdeps/loongarch/lp64/libm-test-ulps         |   4 -
 sysdeps/m68k/coldfire/fpu/libm-test-ulps      |   1 -
 sysdeps/m68k/m680x0/fpu/libm-test-ulps        |   4 -
 sysdeps/microblaze/libm-test-ulps             |   1 -
 sysdeps/mips/mips32/libm-test-ulps            |   4 -
 sysdeps/mips/mips64/libm-test-ulps            |   4 -
 sysdeps/nios2/libm-test-ulps                  |   1 -
 sysdeps/or1k/fpu/libm-test-ulps               |   4 -
 sysdeps/or1k/nofpu/libm-test-ulps             |   4 -
 sysdeps/powerpc/fpu/libm-test-ulps            |   4 -
 sysdeps/powerpc/nofpu/libm-test-ulps          |   4 -
 sysdeps/riscv/nofpu/libm-test-ulps            |   4 -
 sysdeps/riscv/rvd/libm-test-ulps              |   4 -
 sysdeps/s390/fpu/libm-test-ulps               |   4 -
 sysdeps/sh/libm-test-ulps                     |   2 -
 sysdeps/sparc/fpu/libm-test-ulps              |   4 -
 sysdeps/x86_64/fpu/libm-test-ulps             |   4 -
 30 files changed, 117 insertions(+), 291 deletions(-)

Comments

Joseph Myers July 30, 2024, 1:58 p.m. UTC | #1
On Tue, 30 Jul 2024, Paul Zimmermann wrote:

> The CORE-MATH implementation is correctly rounded (for any rounding mode).
> This can be checked by exhaustive tests in a few minutes since there are
> less than 2^32 values to check against for example GNU MPFR.
> 
> Tested on x86_64 under Linux.

Testing for 32-bit x86 is also important, given excess range / precision 
issues.

> diff --git a/math/w_tgammaf_compat.c b/math/w_tgammaf_compat.c
> index 34e0e096e0..f58d3ad74e 100644
> --- a/math/w_tgammaf_compat.c
> +++ b/math/w_tgammaf_compat.c
> @@ -24,6 +24,7 @@ __tgammaf(float x)
>  {
>  	int local_signgam;
>  	float y = __ieee754_gammaf_r(x,&local_signgam);
> +        // local_signgam is not used, but we keep them for compatibility

I think just calling with a NULL argument would be sufficient, given that 
__ieee754_gammaf_r no longer uses the pointer.

> +  /* List of exceptional cases.  */
> +  static const struct {b32u32_u x; float f, df;} tb[] = {
> +    {{.u = 0x27de86a9u}, 0x1.268266p+47f, 0x1p22f},
> +    {{.u = 0x27e05475u}, 0x1.242422p+47f, 0x1p22f},
> +    {{.u = 0xb63befb3u}, -0x1.5cb6e4p+18f, 0x1p-7f},
> +    {{.u = 0x3c7bb570u}, 0x1.021d9p+6f, 0x1p-19f},
> +    {{.u = 0x41e886d1u}, 0x1.33136ap+98f, 0x1p73f},
> +    {{.u = 0xc067d177u}, 0x1.f6850cp-3f, 0x1p-28f},
> +    {{.f = -0x1.33b462p-4}, -0x1.befe66p+3, -0x1p-22f},
> +    {{.f = -0x1.a988b4p-1}, -0x1.a6b4ecp+2, +0x1p-23f},
> +    {{.f = 0x1.dceffcp+4}, 0x1.d3631cp+101, -0x1p-76f},
> +    {{.f = 0x1.0874c8p+0}, 0x1.f6c638p-1, 0x1p-26f},

It would be helpful to have a comment on each field of the struct 
definition saying what its semantics are.  And why do some use .u and some 
use .f in the initializers?  It looks to me like only .u ever actually 
gets used, in which case you could have a simple uint32_t field rather 
than using a union at all.

> +  float fx = __builtin_floorf(x);
> +  if(__builtin_expect(x >= 0x1.18522p+5f, 0))
> +    /* Overflow case. The original CORE-MATH code returns 0x1p127f * 0x1p127f,
> +       but apparently some compilers replace this by +Inf.  */
> +    return x * 0x1p127f;

In overflow and underflow cases it's generally best to use 
math_narrow_eval, so "return math_narrow_eval (x * 0x1p127f);" or similar.  
The wrappers may misbehave if an overflowing or underflowing result is 
returned with excess range.

> +    /* Underflows always happens */
> +    return 0x1p-127f * sgn[k&1];

Similarly, best to use math_narrow_eval here.

> +  static const double c[] =
> +    {0x1.c9a76be577123p+0, 0x1.8f2754ddcf90dp+0, 0x1.0d1191949419bp+0, 0x1.e1f42cf0ae4a1p-2,
> +     0x1.82b358a3ab638p-3, 0x1.e1f2b30cd907bp-5, 0x1.240f6d4071bd8p-6, 0x1.1522c9f3cd012p-8,
> +     0x1.1fd0051a0525bp-10, 0x1.9808a8b96c37ep-13, 0x1.b3f78e01152b5p-15, 0x1.49c85a7e1fd04p-18,
> +     0x1.471ca49184475p-19, -0x1.368f0b7ed9e36p-23, 0x1.882222f9049efp-23, -0x1.a69ed2042842cp-25};

A comment explaining the semantics of this array would be helpful.
Paul Zimmermann July 31, 2024, 7:48 a.m. UTC | #2
thank you again Joseph for your detailed review, I'll send a v3 addressing your comments.

Paul
diff mbox series

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 382fb7bae1..265ad34d8d 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -94,6 +94,7 @@  bench-math := \
   tan \
   tanh \
   tgamma \
+  tgammaf \
   trunc \
   truncf \
   y0 \
diff --git a/math/w_tgammaf_compat.c b/math/w_tgammaf_compat.c
index 34e0e096e0..f58d3ad74e 100644
--- a/math/w_tgammaf_compat.c
+++ b/math/w_tgammaf_compat.c
@@ -24,6 +24,7 @@  __tgammaf(float x)
 {
 	int local_signgam;
 	float y = __ieee754_gammaf_r(x,&local_signgam);
+        // local_signgam is not used, but we keep them for compatibility
 
 	if(__glibc_unlikely (!isfinite (y) || y == 0)
 	   && (isfinite (x) || (isinf (x) && x < 0.0))
@@ -41,7 +42,7 @@  __tgammaf(float x)
 	    /* tgammaf overflow */
 	    return __kernel_standard_f(x, x, 140);
 	}
-	return local_signgam < 0 ? - y : y;
+	return y;
 }
 libm_alias_float (__tgamma, tgamma)
 #endif
diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
index a7a4a94265..b557dee8e5 100644
--- a/sysdeps/aarch64/libm-test-ulps
+++ b/sysdeps/aarch64/libm-test-ulps
@@ -1653,22 +1653,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/alpha/fpu/libm-test-ulps b/sysdeps/alpha/fpu/libm-test-ulps
index e28c2af683..5c6175406b 100644
--- a/sysdeps/alpha/fpu/libm-test-ulps
+++ b/sysdeps/alpha/fpu/libm-test-ulps
@@ -1410,22 +1410,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/arc/fpu/libm-test-ulps b/sysdeps/arc/fpu/libm-test-ulps
index 41c8ef16d7..8f4571b5fa 100644
--- a/sysdeps/arc/fpu/libm-test-ulps
+++ b/sysdeps/arc/fpu/libm-test-ulps
@@ -1093,19 +1093,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 9
 
 Function: "tgamma_downward":
 double: 9
-float: 9
 
 Function: "tgamma_towardzero":
 double: 9
-float: 8
 
 Function: "tgamma_upward":
 double: 9
-float: 9
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/arc/nofpu/libm-test-ulps b/sysdeps/arc/nofpu/libm-test-ulps
index d3f45957d4..6d1f97b3fc 100644
--- a/sysdeps/arc/nofpu/libm-test-ulps
+++ b/sysdeps/arc/nofpu/libm-test-ulps
@@ -262,7 +262,6 @@  float: 2
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps
index 6480353d39..9ca4750f74 100644
--- a/sysdeps/arm/libm-test-ulps
+++ b/sysdeps/arm/libm-test-ulps
@@ -1152,19 +1152,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/csky/fpu/libm-test-ulps b/sysdeps/csky/fpu/libm-test-ulps
index fc634f89ca..151df00638 100644
--- a/sysdeps/csky/fpu/libm-test-ulps
+++ b/sysdeps/csky/fpu/libm-test-ulps
@@ -1061,19 +1061,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 8
-float: 7
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/csky/nofpu/libm-test-ulps b/sysdeps/csky/nofpu/libm-test-ulps
index a1e28c8ee0..48697a7b09 100644
--- a/sysdeps/csky/nofpu/libm-test-ulps
+++ b/sysdeps/csky/nofpu/libm-test-ulps
@@ -1092,19 +1092,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 5
-float: 5
 
 Function: "tgamma_towardzero":
 double: 5
-float: 4
 
 Function: "tgamma_upward":
 double: 4
-float: 4
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/hppa/fpu/libm-test-ulps b/sysdeps/hppa/fpu/libm-test-ulps
index ea5101f6b6..2459868892 100644
--- a/sysdeps/hppa/fpu/libm-test-ulps
+++ b/sysdeps/hppa/fpu/libm-test-ulps
@@ -1181,20 +1181,16 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 1
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
index 03297e6527..5f12dd899a 100644
--- a/sysdeps/i386/fpu/libm-test-ulps
+++ b/sysdeps/i386/fpu/libm-test-ulps
@@ -1699,25 +1699,21 @@  ldouble: 4
 
 Function: "tgamma":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
index 85a2456971..35fc9c36bd 100644
--- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
+++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
@@ -1701,25 +1701,21 @@  ldouble: 4
 
 Function: "tgamma":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_upward":
 double: 8
-float: 8
 float128: 4
 ldouble: 5
 
diff --git a/sysdeps/ieee754/flt-32/e_gammaf_r.c b/sysdeps/ieee754/flt-32/e_gammaf_r.c
index a9730d61c1..53aae1a35b 100644
--- a/sysdeps/ieee754/flt-32/e_gammaf_r.c
+++ b/sysdeps/ieee754/flt-32/e_gammaf_r.c
@@ -1,215 +1,133 @@ 
-/* Implementation of gamma function according to ISO C.
-   Copyright (C) 1997-2024 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
+/* Implementation of the gamma function for binary32.
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+Copyright (c) 2023-2024 Alexei Sibidanov.
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+The original version of this file was copied from the CORE-MATH
+project (file src/binary32/tgamma/tgammaf.c, revision a48e352).
 
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-#include <math.h>
-#include <math-narrow-eval.h>
-#include <math_private.h>
-#include <fenv_private.h>
-#include <math-underflow.h>
-#include <float.h>
-#include <libm-alias-finite.h>
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
 
-/* Coefficients B_2k / 2k(2k-1) of x^-(2k-1) inside exp in Stirling's
-   approximation to gamma function.  */
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+ */
 
-static const float gamma_coeff[] =
-  {
-    0x1.555556p-4f,
-    -0xb.60b61p-12f,
-    0x3.403404p-12f,
-  };
+/* Changes with respect to the original CORE-MATH code:
+   - removed the dealing with errno
+     (this is done in the wrapper math/w_tgammaf_compat.c)
+   - the original wrapper assumes the sign of gamma(x) is put in
+     signgamp, but since this code directly returns the correct
+     sign, signgamp is not used any more (but kept for compatibility)
+ */
 
-#define NCOEFF (sizeof (gamma_coeff) / sizeof (gamma_coeff[0]))
+#include <math.h>
+#include <float.h>
+#include <stdint.h>
+#include <libm-alias-finite.h>
 
-/* Return gamma (X), for positive X less than 42, in the form R *
-   2^(*EXP2_ADJ), where R is the return value and *EXP2_ADJ is set to
-   avoid overflow or underflow in intermediate calculations.  */
-
-static float
-gammaf_positive (float x, int *exp2_adj)
-{
-  int local_signgam;
-  if (x < 0.5f)
-    {
-      *exp2_adj = 0;
-      return __ieee754_expf (__ieee754_lgammaf_r (x + 1, &local_signgam)) / x;
-    }
-  else if (x <= 1.5f)
-    {
-      *exp2_adj = 0;
-      return __ieee754_expf (__ieee754_lgammaf_r (x, &local_signgam));
-    }
-  else if (x < 2.5f)
-    {
-      *exp2_adj = 0;
-      float x_adj = x - 1;
-      return (__ieee754_expf (__ieee754_lgammaf_r (x_adj, &local_signgam))
-	      * x_adj);
-    }
-  else
-    {
-      float eps = 0;
-      float x_eps = 0;
-      float x_adj = x;
-      float prod = 1;
-      if (x < 4.0f)
-	{
-	  /* Adjust into the range for applying Stirling's
-	     approximation.  */
-	  float n = ceilf (4.0f - x);
-	  x_adj = math_narrow_eval (x + n);
-	  x_eps = (x - (x_adj - n));
-	  prod = __gamma_productf (x_adj - n, x_eps, n, &eps);
-	}
-      /* The result is now gamma (X_ADJ + X_EPS) / (PROD * (1 + EPS)).
-	 Compute gamma (X_ADJ + X_EPS) using Stirling's approximation,
-	 starting by computing pow (X_ADJ, X_ADJ) with a power of 2
-	 factored out.  */
-      float exp_adj = -eps;
-      float x_adj_int = roundf (x_adj);
-      float x_adj_frac = x_adj - x_adj_int;
-      int x_adj_log2;
-      float x_adj_mant = __frexpf (x_adj, &x_adj_log2);
-      if (x_adj_mant < M_SQRT1_2f)
-	{
-	  x_adj_log2--;
-	  x_adj_mant *= 2.0f;
-	}
-      *exp2_adj = x_adj_log2 * (int) x_adj_int;
-      float ret = (__ieee754_powf (x_adj_mant, x_adj)
-		   * __ieee754_exp2f (x_adj_log2 * x_adj_frac)
-		   * __ieee754_expf (-x_adj)
-		   * sqrtf (2 * M_PIf / x_adj)
-		   / prod);
-      exp_adj += x_eps * __ieee754_logf (x_adj);
-      float bsum = gamma_coeff[NCOEFF - 1];
-      float x_adj2 = x_adj * x_adj;
-      for (size_t i = 1; i <= NCOEFF - 1; i++)
-	bsum = bsum / x_adj2 + gamma_coeff[NCOEFF - 1 - i];
-      exp_adj += bsum / x_adj;
-      return ret + ret * __expm1f (exp_adj);
-    }
-}
+typedef union {float f; uint32_t u;} b32u32_u;
+typedef union {double f; uint64_t u;} b64u64_u;
 
 float
 __ieee754_gammaf_r (float x, int *signgamp)
 {
-  int32_t hx;
-  float ret;
-
-  GET_FLOAT_WORD (hx, x);
+  /* List of exceptional cases.  */
+  static const struct {b32u32_u x; float f, df;} tb[] = {
+    {{.u = 0x27de86a9u}, 0x1.268266p+47f, 0x1p22f},
+    {{.u = 0x27e05475u}, 0x1.242422p+47f, 0x1p22f},
+    {{.u = 0xb63befb3u}, -0x1.5cb6e4p+18f, 0x1p-7f},
+    {{.u = 0x3c7bb570u}, 0x1.021d9p+6f, 0x1p-19f},
+    {{.u = 0x41e886d1u}, 0x1.33136ap+98f, 0x1p73f},
+    {{.u = 0xc067d177u}, 0x1.f6850cp-3f, 0x1p-28f},
+    {{.f = -0x1.33b462p-4}, -0x1.befe66p+3, -0x1p-22f},
+    {{.f = -0x1.a988b4p-1}, -0x1.a6b4ecp+2, +0x1p-23f},
+    {{.f = 0x1.dceffcp+4}, 0x1.d3631cp+101, -0x1p-76f},
+    {{.f = 0x1.0874c8p+0}, 0x1.f6c638p-1, 0x1p-26f},
+  };
 
-  if (__glibc_unlikely ((hx & 0x7fffffff) == 0))
-    {
-      /* Return value for x == 0 is Inf with divide by zero exception.  */
-      *signgamp = 0;
-      return 1.0 / x;
+  b32u32_u t = {.f = x};
+  uint32_t ax = t.u<<1;
+  if(__builtin_expect(ax>=(0xffu<<24), 0)){ /* x=NaN or +/-Inf */
+    if(ax==(0xffu<<24)){ /* x=+/-Inf */
+      if(t.u>>31) /* x=-Inf */
+        return x / x; /* will raise the "Invalid operation" exception */
+      return x; /* x=+Inf */
     }
-  if (__builtin_expect (hx < 0, 0)
-      && (uint32_t) hx < 0xff800000 && rintf (x) == x)
-    {
-      /* Return value for integer x < 0 is NaN with invalid exception.  */
-      *signgamp = 0;
-      return (x - x) / (x - x);
-    }
-  if (__glibc_unlikely (hx == 0xff800000))
-    {
-      /* x == -Inf.  According to ISO this is NaN.  */
-      *signgamp = 0;
-      return x - x;
-    }
-  if (__glibc_unlikely ((hx & 0x7f800000) == 0x7f800000))
-    {
-      /* Positive infinity (return positive infinity) or NaN (return
-	 NaN).  */
-      *signgamp = 0;
-      return x + x;
+    return x + x; /* x=NaN, where x+x ensures the "Invalid operation"
+                     exception is set if x is sNaN */
+  }
+  double z = x;
+  if(__builtin_expect(ax<0x6d000000u, 0)){ /* |x| < 0x1p-18 */
+    volatile double d = (0x1.fa658c23b1578p-1 - 0x1.d0a118f324b63p-1*z)*z - 0x1.2788cfc6fb619p-1;
+    double f = 1.0/z + d;
+    float r = f;
+    b64u64_u rt = {.f = f};
+    if(((rt.u+2)&0xfffffff) < 4){
+      for(unsigned i=0;i<sizeof(tb)/sizeof(tb[0]);i++)
+	if(t.u==tb[i].x.u) return tb[i].f + tb[i].df;
     }
+    return r;
+  }
+  float fx = __builtin_floorf(x);
+  if(__builtin_expect(x >= 0x1.18522p+5f, 0))
+    /* Overflow case. The original CORE-MATH code returns 0x1p127f * 0x1p127f,
+       but apparently some compilers replace this by +Inf.  */
+    return x * 0x1p127f;
+  int k = fx;
+  if(__builtin_expect(fx==x, 0)){ /* x is integer */
+    if(x == 0.0f)
+      return 1.0f/x;
+    if(x < 0.0f)
+      return 0.0f / 0.0f; /* should raise the "Invalid operation" exception */
+    double t0 = 1, x0 = 1;
+    for(int i=1; i<k; i++, x0 += 1.0) t0 *= x0;
+    return t0;
+  }
+  if(__builtin_expect(x<-42.0f, 0)){ /* negative non-integer */
+    /* For x < -42, x non-integer, |gamma(x)| < 2^-151.  */
+    static const float sgn[2] = {0x1p-127f, -0x1p-127f};
+    /* Underflows always happens */
+    return 0x1p-127f * sgn[k&1];
+  }
+  static const double c[] =
+    {0x1.c9a76be577123p+0, 0x1.8f2754ddcf90dp+0, 0x1.0d1191949419bp+0, 0x1.e1f42cf0ae4a1p-2,
+     0x1.82b358a3ab638p-3, 0x1.e1f2b30cd907bp-5, 0x1.240f6d4071bd8p-6, 0x1.1522c9f3cd012p-8,
+     0x1.1fd0051a0525bp-10, 0x1.9808a8b96c37ep-13, 0x1.b3f78e01152b5p-15, 0x1.49c85a7e1fd04p-18,
+     0x1.471ca49184475p-19, -0x1.368f0b7ed9e36p-23, 0x1.882222f9049efp-23, -0x1.a69ed2042842cp-25};
 
-  if (x >= 36.0f)
-    {
-      /* Overflow.  */
-      *signgamp = 0;
-      ret = math_narrow_eval (FLT_MAX * FLT_MAX);
-      return ret;
-    }
-  else
-    {
-      SET_RESTORE_ROUNDF (FE_TONEAREST);
-      if (x > 0.0f)
-	{
-	  *signgamp = 0;
-	  int exp2_adj;
-	  float tret = gammaf_positive (x, &exp2_adj);
-	  ret = __scalbnf (tret, exp2_adj);
-	}
-      else if (x >= -FLT_EPSILON / 4.0f)
-	{
-	  *signgamp = 0;
-	  ret = 1.0f / x;
-	}
-      else
-	{
-	  float tx = truncf (x);
-	  *signgamp = (tx == 2.0f * truncf (tx / 2.0f)) ? -1 : 1;
-	  if (x <= -42.0f)
-	    /* Underflow.  */
-	    ret = FLT_MIN * FLT_MIN;
-	  else
-	    {
-	      float frac = tx - x;
-	      if (frac > 0.5f)
-		frac = 1.0f - frac;
-	      float sinpix = (frac <= 0.25f
-			      ? __sinf (M_PIf * frac)
-			      : __cosf (M_PIf * (0.5f - frac)));
-	      int exp2_adj;
-	      float tret = M_PIf / (-x * sinpix
-				    * gammaf_positive (-x, &exp2_adj));
-	      ret = __scalbnf (tret, -exp2_adj);
-	      math_check_force_underflow_nonneg (ret);
-	    }
-	}
-      ret = math_narrow_eval (ret);
-    }
-  if (isinf (ret) && x != 0)
-    {
-      if (*signgamp < 0)
-	{
-	  ret = math_narrow_eval (-copysignf (FLT_MAX, ret) * FLT_MAX);
-	  ret = -ret;
-	}
-      else
-	ret = math_narrow_eval (copysignf (FLT_MAX, ret) * FLT_MAX);
-      return ret;
-    }
-  else if (ret == 0)
-    {
-      if (*signgamp < 0)
-	{
-	  ret = math_narrow_eval (-copysignf (FLT_MIN, ret) * FLT_MIN);
-	  ret = -ret;
-	}
-      else
-	ret = math_narrow_eval (copysignf (FLT_MIN, ret) * FLT_MIN);
-      return ret;
+  double m = z - 0x1.7p+1, i = __builtin_roundeven(m), step = __builtin_copysign(1.0,i);
+  double d = m - i, d2 = d*d, d4 = d2*d2, d8 = d4*d4;
+  double f = (c[0] + d*c[1]) + d2*(c[2] + d*c[3]) + d4*((c[4] + d*c[5]) + d2*(c[6] + d*c[7]))
+    + d8*((c[8] + d*c[9]) + d2*(c[10] + d*c[11]) + d4*((c[12] + d*c[13]) + d2*(c[14] + d*c[15])));
+  int jm = __builtin_fabs(i);
+  double w = 1;
+  if(jm){
+    z -= 0.5 + step*0.5;
+    w = z;
+    for(int j=jm-1; j; j--) {z -= step; w *= z;}
+  }
+  if(i<=-0.5) w = 1/w;
+  f *= w;
+  b64u64_u rt = {.f = f};
+  float r = f;
+  /* Deal with exceptional cases.  */
+  if(__builtin_expect(((rt.u+2)&0xfffffff) < 8, 0)){
+    for(unsigned j=0;j<sizeof(tb)/sizeof(tb[0]);j++) {
+      if(t.u==tb[j].x.u) return tb[j].f + tb[j].df;
     }
-  else
-    return ret;
+  }
+  return r;
 }
 libm_alias_finite (__ieee754_gammaf_r, __gammaf_r)
diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps
index bdfd683454..5668a801c9 100644
--- a/sysdeps/loongarch/lp64/libm-test-ulps
+++ b/sysdeps/loongarch/lp64/libm-test-ulps
@@ -1432,22 +1432,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/m68k/coldfire/fpu/libm-test-ulps b/sysdeps/m68k/coldfire/fpu/libm-test-ulps
index 1b25a70e3f..ae05498719 100644
--- a/sysdeps/m68k/coldfire/fpu/libm-test-ulps
+++ b/sysdeps/m68k/coldfire/fpu/libm-test-ulps
@@ -146,7 +146,6 @@  double: 1
 
 Function: "tgamma":
 double: 1
-float: 1
 
 Function: "y0":
 double: 2
diff --git a/sysdeps/m68k/m680x0/fpu/libm-test-ulps b/sysdeps/m68k/m680x0/fpu/libm-test-ulps
index 6eacfb6b6f..73e98eb479 100644
--- a/sysdeps/m68k/m680x0/fpu/libm-test-ulps
+++ b/sysdeps/m68k/m680x0/fpu/libm-test-ulps
@@ -1208,22 +1208,18 @@  float: 1
 
 Function: "tgamma":
 double: 3
-float: 9
 ldouble: 9
 
 Function: "tgamma_downward":
 double: 3
-float: 9
 ldouble: 9
 
 Function: "tgamma_towardzero":
 double: 3
-float: 9
 ldouble: 9
 
 Function: "tgamma_upward":
 double: 2
-float: 9
 ldouble: 9
 
 Function: "y0":
diff --git a/sysdeps/microblaze/libm-test-ulps b/sysdeps/microblaze/libm-test-ulps
index d3666eb7d4..98ea940ccc 100644
--- a/sysdeps/microblaze/libm-test-ulps
+++ b/sysdeps/microblaze/libm-test-ulps
@@ -257,7 +257,6 @@  float: 2
 
 Function: "tgamma":
 double: 5
-float: 4
 
 Function: "y0":
 double: 2
diff --git a/sysdeps/mips/mips32/libm-test-ulps b/sysdeps/mips/mips32/libm-test-ulps
index 83772d5772..a6480f1dcf 100644
--- a/sysdeps/mips/mips32/libm-test-ulps
+++ b/sysdeps/mips/mips32/libm-test-ulps
@@ -1156,19 +1156,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/mips/mips64/libm-test-ulps b/sysdeps/mips/mips64/libm-test-ulps
index 0addfd6cb4..3c86d96053 100644
--- a/sysdeps/mips/mips64/libm-test-ulps
+++ b/sysdeps/mips/mips64/libm-test-ulps
@@ -1444,22 +1444,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/nios2/libm-test-ulps b/sysdeps/nios2/libm-test-ulps
index c8d1a722f7..df6e932cb6 100644
--- a/sysdeps/nios2/libm-test-ulps
+++ b/sysdeps/nios2/libm-test-ulps
@@ -266,7 +266,6 @@  float: 2
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/or1k/fpu/libm-test-ulps b/sysdeps/or1k/fpu/libm-test-ulps
index 59b9f072f5..4ccb136298 100644
--- a/sysdeps/or1k/fpu/libm-test-ulps
+++ b/sysdeps/or1k/fpu/libm-test-ulps
@@ -1066,19 +1066,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 9
-float: 9
 
 Function: "tgamma_towardzero":
 double: 9
-float: 8
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/or1k/nofpu/libm-test-ulps b/sysdeps/or1k/nofpu/libm-test-ulps
index 726855faaa..7087cf9add 100644
--- a/sysdeps/or1k/nofpu/libm-test-ulps
+++ b/sysdeps/or1k/nofpu/libm-test-ulps
@@ -1064,19 +1064,15 @@  float: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_downward":
 double: 9
-float: 9
 
 Function: "tgamma_towardzero":
 double: 9
-float: 8
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps
index 2e038492cd..dc953bead9 100644
--- a/sysdeps/powerpc/fpu/libm-test-ulps
+++ b/sysdeps/powerpc/fpu/libm-test-ulps
@@ -1828,25 +1828,21 @@  ldouble: 6
 
 Function: "tgamma":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 float128: 5
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
diff --git a/sysdeps/powerpc/nofpu/libm-test-ulps b/sysdeps/powerpc/nofpu/libm-test-ulps
index dc9b499cc4..c604baa245 100644
--- a/sysdeps/powerpc/nofpu/libm-test-ulps
+++ b/sysdeps/powerpc/nofpu/libm-test-ulps
@@ -1560,22 +1560,18 @@  ldouble: 6
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/riscv/nofpu/libm-test-ulps b/sysdeps/riscv/nofpu/libm-test-ulps
index 9ad64d1d85..4adad138b0 100644
--- a/sysdeps/riscv/nofpu/libm-test-ulps
+++ b/sysdeps/riscv/nofpu/libm-test-ulps
@@ -1361,22 +1361,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 5
-float: 5
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 5
-float: 4
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 4
-float: 4
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/riscv/rvd/libm-test-ulps b/sysdeps/riscv/rvd/libm-test-ulps
index 1e6c092361..eb27d4f2f9 100644
--- a/sysdeps/riscv/rvd/libm-test-ulps
+++ b/sysdeps/riscv/rvd/libm-test-ulps
@@ -1431,22 +1431,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 8
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/s390/fpu/libm-test-ulps b/sysdeps/s390/fpu/libm-test-ulps
index 9ac3db4fa5..e6872cf932 100644
--- a/sysdeps/s390/fpu/libm-test-ulps
+++ b/sysdeps/s390/fpu/libm-test-ulps
@@ -1429,22 +1429,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/sh/libm-test-ulps b/sysdeps/sh/libm-test-ulps
index 3c84259941..36f21ed395 100644
--- a/sysdeps/sh/libm-test-ulps
+++ b/sysdeps/sh/libm-test-ulps
@@ -532,11 +532,9 @@  float: 2
 
 Function: "tgamma":
 double: 9
-float: 8
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 
 Function: "y0":
 double: 3
diff --git a/sysdeps/sparc/fpu/libm-test-ulps b/sysdeps/sparc/fpu/libm-test-ulps
index 0142357b3f..ae1e8ee972 100644
--- a/sysdeps/sparc/fpu/libm-test-ulps
+++ b/sysdeps/sparc/fpu/libm-test-ulps
@@ -1444,22 +1444,18 @@  ldouble: 3
 
 Function: "tgamma":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 ldouble: 5
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 ldouble: 4
 
 Function: "y0":
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 37d8998c71..29e3ca926a 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -2263,25 +2263,21 @@  double: 1
 
 Function: "tgamma":
 double: 9
-float: 8
 float128: 4
 ldouble: 5
 
 Function: "tgamma_downward":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_towardzero":
 double: 9
-float: 7
 float128: 5
 ldouble: 6
 
 Function: "tgamma_upward":
 double: 9
-float: 8
 float128: 4
 ldouble: 5