Sparc exp(), expf() performance improvement

Message ID	1501529969-96949-1-git-send-email-patrick.mcgehearty@oracle.com
State	New
Headers	show Return-Path: <libc-alpha-return-82574-incoming=patchwork.ozlabs.org@sourceware.org> DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:content-type; q=dns; s=default; b=Cv4GNxbubucKnl2pgF6skmVPdGcRiuOHaOeGwF1bsB9 0XZ8gO/3V79n5JLWhbKMbVQt52k38a4F/Cm/bswn/jBuIz3VIex1cUDLv5tt9iJI DonOuc4XP/WG0lOa8Msa1zpvsJlMxLpIHG1LF+O7YAf3YTy/esEZoO4LQr/bWuq8 = Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org From: Patrick McGehearty <patrick.mcgehearty@oracle.com> To: libc-alpha@sourceware.org Subject: [PATCH] Sparc exp(), expf() performance improvement Date: Mon, 31 Jul 2017 15:39:29 -0400 Message-Id: <1501529969-96949-1-git-send-email-patrick.mcgehearty@oracle.com> Content-Type: text/x-patch

This PATCH is intended to improve exp() and expf() performance on Sparc. These changes will only be active on Sparc platforms and only for those platforms that support HWCAP_SPARC_CRYPTO (niagara4 and later). Typical Performance Gains for new exp() code for Sparc exp() With -mcpu=niagara4, 8x to 14x (depending on input value) Without -mcpu=niagara4, 5x to 8x (depending on input value) expf() With -mcpu=niagara4, 16x Without -mcpu=niagara4, 15x Some portion of these speedups are due to taking advantage of specific features new to the niagara4 and later architectures. Glibc correctness tests for exp() and expf() were run. Within the test suite 3 input values were found to cause 1 bit differences (ulp) when "FE_TONEAREST" rounding mode is set. No differences were seen for the tested values for the other rounding modes. Typical example: exp(-0x1.760cd2p+0) (-1.46113312244415283203125) new code: 2.31973271630014299393707e-01 0x1.db14cd799387ap-3 old code: 2.31973271630014271638132e-01 0x1.db14cd7993879p-3 exp = 2.31973271630014285508337 (high precision) Old delta: off by 0.49 ulp New delta: off by 0.51 ulp This is a difference of 0.02 ulp which is well within the 1 ulp requirement although the test function reports the difference as a failure. For expf(), no differences in the test suite were seen in FE_TONEAREST, FE_DOWNWARD, FE_TOWARDZERO. Two differences were seen with FE_UPWARD. For both cases, the precise value was less than 0.00005 less than the value reported by the new code. The difference of the old and new was 1 ulp. Typical example: Failure: Test: exp_upward (-0x1p-20) Result: new code: 9.999990463256e-01 0x1.ffffe0p-1 old code: 9.999991059303e-01 0x1.ffffe2p-1 precise 9.999990463261e-01 (high precision) (using bc -l at scale=60) Changes to be committed: (use "git reset HEAD <file>..." to unstage) * sysdeps/sparc/configure.ac: manage -mcpu=niagara4 test * sysdeps/sparc/configure: Regenerate * sysdeps/sparc/fpu/libm_endian.h: New file * sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/Makefile: add e_exp*-niagara4.c functions and -mcpu=niagara4 option * sysdeps/sparc/sparc64/fpu/multiarch/Makefile: Likewise * sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c: New file * sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_expf-niagara4.c: New file * sysdeps/sparc/sparc64/fpu/multiarch/e_exp-niagara4.c: New file * sysdeps/sparc/sparc64/fpu/multiarch/e_expf-niagara4.c: New file --- sysdeps/sparc/configure | 24 + sysdeps/sparc/configure.ac | 12 + sysdeps/sparc/fpu/libm_endian.h | 32 ++ .../sparc/sparc32/sparcv9/fpu/multiarch/Makefile | 9 +- .../sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c | 1 + .../sparcv9/fpu/multiarch/e_expf-niagara4.c | 1 + sysdeps/sparc/sparc64/fpu/multiarch/Makefile | 9 +- .../sparc/sparc64/fpu/multiarch/e_exp-niagara4.c | 433 +++++++++++++++++++ .../sparc/sparc64/fpu/multiarch/e_expf-niagara4.c | 445 ++++++++++++++++++++ 9 files changed, 964 insertions(+), 2 deletions(-) create mode 100644 sysdeps/sparc/fpu/libm_endian.h create mode 100644 sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c create mode 100644 sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_expf-niagara4.c create mode 100644 sysdeps/sparc/sparc64/fpu/multiarch/e_exp-niagara4.c create mode 100644 sysdeps/sparc/sparc64/fpu/multiarch/e_expf-niagara4.c diff --git a/sysdeps/sparc/configure b/sysdeps/sparc/configure index 90a86f6..cf4d567 100644 --- a/sysdeps/sparc/configure +++ b/sysdeps/sparc/configure @@ -81,3 +81,27 @@ if test $libc_cv_sparc_gcc_gotdata = yes; then $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h fi + +# Check for GCC support of -mcpu=niagara4 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for niagara4 compiler support" >&5 +$as_echo_n "checking for niagara4 compiler support... " >&6; } +if ${libc_cv_sparc_niagara4+:} false; then : + $as_echo_n "(cached) " >&6 +else + echo > conftest.S +if { ac_try='${CC-cc} -mcpu=niagara4 conftest.S -S' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_sparc_niagara4=yes +else + libc_cv_sparc_niagara4=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_sparc_niagara4" >&5 +$as_echo "$libc_cv_sparc_niagara4" >&6; } +config_vars="$config_vars +sparc_n4flags = $libc_cv_sparc_niagara4" diff --git a/sysdeps/sparc/configure.ac b/sysdeps/sparc/configure.ac index 982077c..f9f5667 100644 --- a/sysdeps/sparc/configure.ac +++ b/sysdeps/sparc/configure.ac @@ -57,3 +57,15 @@ fi if test $libc_cv_sparc_gcc_gotdata = yes; then AC_DEFINE(PI_STATIC_AND_HIDDEN) fi + +# Check for GCC support of -mcpu=niagara4 +AC_CACHE_CHECK(for niagara4 compiler support, libc_cv_sparc_niagara4, [dnl +echo > conftest.S +dnl +if AC_TRY_COMMAND([${CC-cc} -mcpu=niagara4 conftest.S -S]); then + libc_cv_sparc_niagara4=yes +else + libc_cv_sparc_niagara4=no +fi +rm -f conftest*]) +LIBC_CONFIG_VAR([sparc_n4flags], [$libc_cv_sparc_niagara4]) diff --git a/sysdeps/sparc/fpu/libm_endian.h b/sysdeps/sparc/fpu/libm_endian.h new file mode 100644 index 0000000..a49abcd --- /dev/null +++ b/sysdeps/sparc/fpu/libm_endian.h @@ -0,0 +1,32 @@ +/* + Contributed by Oracle Inc. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _LIBM_ENDIAN_H +#define _LIBM_ENDIAN_H + +#define HIWORD 0 +#define LOWORD 1 +#define HIXWORD 0 /* index of int containing exponent */ +#define XSGNMSK 0x80000000 /* exponent bit mask within the int */ +/* XBIASED_EXP(x) must be an int expression; so ~0x80000000 is no good */ +#define XBIASED_EXP(x) ((((int *)&x)[HIXWORD] & 0x7fffffff) >> 16) +#define ISZEROL(x) (((((int *)&x)[0] & ~XSGNMSK) | ((int *)&x)[1] | \ + ((int *)&x)[2] | ((int *)&x)[3]) == 0) + +#endif /* !defined(_LIBM_ENDIAN_H) */ diff --git a/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/Makefile b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/Makefile index 322e300..2d5bc0b 100644 --- a/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/Makefile +++ b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/Makefile @@ -1,4 +1,5 @@ ifeq ($(subdir),math) +libm-sysdep_routines += e_exp-niagara4 e_expf-niagara4 ifeq ($(have-as-vis3),yes) libm-sysdep_routines += m_copysignf-vis3 m_copysign-vis3 s_fabs-vis3 \ s_fabsf-vis3 s_llrintf-vis3 s_llrint-vis3 \ @@ -7,8 +8,14 @@ libm-sysdep_routines += m_copysignf-vis3 m_copysign-vis3 s_fabs-vis3 \ s_fmaf-vis3 s_fma-vis3 s_nearbyint-vis3 \ s_nearbyintf-vis3 s_fdimf-vis3 s_fdim-vis3 sysdep_routines += s_copysignf-vis3 s_copysign-vis3 - CFLAGS-s_fdimf-vis3.c += -Wa,-Av9d -mvis3 CFLAGS-s_fdim-vis3.c += -Wa,-Av9d -mvis3 endif +ifeq ($(sparc_n4flags),yes) +N4_CFLAGS = -mcpu=niagara4 +else +N4_CFLAGS = +endif +CFLAGS-e_exp-niagara4.c += ${N4_CFLAGS} +CFLAGS-e_expf-niagara4.c += ${N4_CFLAGS} endif diff --git a/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c new file mode 100644 index 0000000..fc29e48 --- /dev/null +++ b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_exp-niagara4.c @@ -0,0 +1 @@ +#include <sparc64/fpu/multiarch/e_exp-niagara4.c> diff --git a/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_expf-niagara4.c b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_expf-niagara4.c new file mode 100644 index 0000000..2cd92c2 --- /dev/null +++ b/sysdeps/sparc/sparc32/sparcv9/fpu/multiarch/e_expf-niagara4.c @@ -0,0 +1 @@ +#include <sparc64/fpu/multiarch/e_expf-niagara4.c> diff --git a/sysdeps/sparc/sparc64/fpu/multiarch/Makefile b/sysdeps/sparc/sparc64/fpu/multiarch/Makefile index 03a271d..6379424 100644 --- a/sysdeps/sparc/sparc64/fpu/multiarch/Makefile +++ b/sysdeps/sparc/sparc64/fpu/multiarch/Makefile @@ -1,4 +1,5 @@ ifeq ($(subdir),math) +libm-sysdep_routines += e_exp-niagara4 e_expf-niagara4 ifeq ($(have-as-vis3),yes) libm-sysdep_routines += m_signbitf-vis3 m_signbit-vis3 m_finitef-vis3 \ m_finite-vis3 m_isinff-vis3 m_isinf-vis3 \ @@ -11,7 +12,6 @@ libm-sysdep_routines += m_signbitf-vis3 m_signbit-vis3 m_finitef-vis3 \ sysdep_routines += s_signbitf-vis3 s_signbit-vis3 s_finitef-vis3 \ s_finite-vis3 s_isinff-vis3 s_isinf-vis3 \ s_isnanf-vis3 s_isnan-vis3 - CFLAGS-s_ceilf-vis3.c += -Wa,-Av9d -mvis3 CFLAGS-s_ceil-vis3.c += -Wa,-Av9d -mvis3 CFLAGS-s_floorf-vis3.c += -Wa,-Av9d -mvis3 @@ -19,4 +19,11 @@ CFLAGS-s_floor-vis3.c += -Wa,-Av9d -mvis3 CFLAGS-s_truncf-vis3.c += -Wa,-Av9d -mvis3 CFLAGS-s_trunc-vis3.c += -Wa,-Av9d -mvis3 endif +ifeq ($(sparc_n4flags),yes) +N4_CFLAGS = -mcpu=niagara4 +else +N4_CFLAGS = +endif +CFLAGS-e_exp-niagara4.c += ${N4_CFLAGS} +CFLAGS-e_expf-niagara4.c += ${N4_CFLAGS} endif diff --git a/sysdeps/sparc/sparc64/fpu/multiarch/e_exp-niagara4.c b/sysdeps/sparc/sparc64/fpu/multiarch/e_exp-niagara4.c new file mode 100644 index 0000000..a179668 --- /dev/null +++ b/sysdeps/sparc/sparc64/fpu/multiarch/e_exp-niagara4.c @@ -0,0 +1,433 @@ +/* EXP function + Contributed by Oracle Inc. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* + * exp(x) + * Hybrid algorithm of Peter Tang's Table driven method (for large + * arguments) and an accurate table (for small arguments). + * Written by K.C. Ng, November 1988. + * Method (large arguments): + * 1. Argument Reduction: given the input x, find r and integer k + * and j such that + * x = (k+j/32)*(ln2) + r, |r| <= (1/64)*ln2 + * + * 2. exp(x) = 2^k * (2^(j/32) + 2^(j/32)*expm1(r)) + * a. expm1(r) is approximated by a polynomial: + * expm1(r) ~ r + t1*r^2 + t2*r^3 + ... + t5*r^6 + * Here t1 = 1/2 exactly. + * b. 2^(j/32) is represented to twice double precision + * as TBL[2j]+TBL[2j+1]. + * + * Note: If divide were fast enough, we could use another approximation + * in 2.a: + * expm1(r) ~ (2r)/(2-R), R = r - r^2*(t1 + t2*r^2) + * (for the same t1 and t2 as above) + * + * Special cases: + * exp(INF) is INF, exp(NaN) is NaN; + * exp(-INF)= 0; + * for finite argument, only exp(0)=1 is exact. + * + * Accuracy: + * According to an error analysis, the error is always less than + * an ulp (unit in the last place). The largest errors observed + * are less than 0.55 ulp for normal results and less than 0.75 ulp + * for subnormal results. + * + * Misc. info. + * For IEEE double + * if x > 7.09782712893383973096e+02 then exp(x) overflow + * if x < -7.45133219101941108420e+02 then exp(x) underflow + */ + +/* #pragma weak exp = __exp_niagara4 */ +#include <sparc-ifunc.h> +#include "libm_endian.h" +#include <math.h> +#include <math_private.h> +#include <errno.h> + +extern double exp (double); +extern double __exp_niagara4 (double); +extern double __ieee754_exp (double); + +sparc_libm_ifunc (exp, hwcap & HWCAP_SPARC_CRYPTO ? + __exp_niagara4 : __ieee754_exp); + + +static const double TBL[] = { + 1.00000000000000000000e+00, 0.00000000000000000000e+00, + 1.02189714865411662714e+00, 5.10922502897344389359e-17, + 1.04427378242741375480e+00, 8.55188970553796365958e-17, + 1.06714040067682369717e+00, -7.89985396684158212226e-17, + 1.09050773266525768967e+00, -3.04678207981247114697e-17, + 1.11438674259589243221e+00, 1.04102784568455709549e-16, + 1.13878863475669156458e+00, 8.91281267602540777782e-17, + 1.16372485877757747552e+00, 3.82920483692409349872e-17, + 1.18920711500272102690e+00, 3.98201523146564611098e-17, + 1.21524735998046895524e+00, -7.71263069268148813091e-17, + 1.24185781207348400201e+00, 4.65802759183693679123e-17, + 1.26905095719173321989e+00, 2.66793213134218609523e-18, + 1.29683955465100964055e+00, 2.53825027948883149593e-17, + 1.32523664315974132322e+00, -2.85873121003886075697e-17, + 1.35425554693689265129e+00, 7.70094837980298946162e-17, + 1.38390988196383202258e+00, -6.77051165879478628716e-17, + 1.41421356237309514547e+00, -9.66729331345291345105e-17, + 1.44518080697704665027e+00, -3.02375813499398731940e-17, + 1.47682614593949934623e+00, -3.48399455689279579579e-17, + 1.50916442759342284141e+00, -1.01645532775429503911e-16, + 1.54221082540794074411e+00, 7.94983480969762085616e-17, + 1.57598084510788649659e+00, -1.01369164712783039808e-17, + 1.61049033194925428347e+00, 2.47071925697978878522e-17, + 1.64575547815396494578e+00, -1.01256799136747726038e-16, + 1.68179283050742900407e+00, 8.19901002058149652013e-17, + 1.71861929812247793414e+00, -1.85138041826311098821e-17, + 1.75625216037329945351e+00, 2.96014069544887330703e-17, + 1.79470907500310716820e+00, 1.82274584279120867698e-17, + 1.83400808640934243066e+00, 3.28310722424562658722e-17, + 1.87416763411029996256e+00, -6.12276341300414256164e-17, + 1.91520656139714740007e+00, -1.06199460561959626376e-16, + 1.95714412417540017941e+00, 8.96076779103666776760e-17, +}; + +/* + * For i = 0, ..., 66, + * TBL2[2*i] is a double precision number near (i+1)*2^-6, and + * TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less + * than 2^-60. + * + * For i = 67, ..., 133, + * TBL2[2*i] is a double precision number near -(i+1)*2^-6, and + * TBL2[2*i+1] = exp(TBL2[2*i]) to within a relative error less + * than 2^-60. + */ +static const double TBL2[] = { + 1.56249999999984491572e-02, 1.01574770858668417262e+00, + 3.12499999999998716305e-02, 1.03174340749910253834e+00, + 4.68750000000011102230e-02, 1.04799100201663386578e+00, + 6.24999999999990632493e-02, 1.06449445891785843266e+00, + 7.81249999999999444888e-02, 1.08125780744903954300e+00, + 9.37500000000013322676e-02, 1.09828514030782731226e+00, + 1.09375000000001346145e-01, 1.11558061464248226002e+00, + 1.24999999999999417133e-01, 1.13314845306682565607e+00, + 1.40624999999995337063e-01, 1.15099294469117108264e+00, + 1.56249999999996141975e-01, 1.16911844616949989195e+00, + 1.71874999999992894573e-01, 1.18752938276309216725e+00, + 1.87500000000000888178e-01, 1.20623024942098178158e+00, + 2.03124999999361649516e-01, 1.22522561187652545556e+00, + 2.18750000000000416334e-01, 1.24452010776609567344e+00, + 2.34375000000003524958e-01, 1.26411844775347081971e+00, + 2.50000000000006328271e-01, 1.28402541668774961003e+00, + 2.65624999999982791543e-01, 1.30424587476761533189e+00, + 2.81249999999993727240e-01, 1.32478475872885725906e+00, + 2.96875000000003275158e-01, 1.34564708304941493822e+00, + 3.12500000000002886580e-01, 1.36683794117380030819e+00, + 3.28124999999993394173e-01, 1.38836250675661765364e+00, + 3.43749999999998612221e-01, 1.41022603492570874906e+00, + 3.59374999999992450483e-01, 1.43243386356506730017e+00, + 3.74999999999991395772e-01, 1.45499141461818881638e+00, + 3.90624999999997613020e-01, 1.47790419541173490003e+00, + 4.06249999999991895372e-01, 1.50117780000011058483e+00, + 4.21874999999996613820e-01, 1.52481791053132154090e+00, + 4.37500000000004607426e-01, 1.54883029863414023453e+00, + 4.53125000000004274359e-01, 1.57322082682725961078e+00, + 4.68750000000008326673e-01, 1.59799544995064657371e+00, + 4.84374999999985456078e-01, 1.62316021661928200359e+00, + 4.99999999999997335465e-01, 1.64872127070012375327e+00, + 5.15625000000000222045e-01, 1.67468485281178436352e+00, + 5.31250000000003441691e-01, 1.70105730184840653330e+00, + 5.46874999999999111822e-01, 1.72784505652716169344e+00, + 5.62499999999999333866e-01, 1.75505465696029738787e+00, + 5.78124999999993338662e-01, 1.78269274625180318417e+00, + 5.93749999999999666933e-01, 1.81076607211938656050e+00, + 6.09375000000003441691e-01, 1.83928148854178719063e+00, + 6.24999999999995559108e-01, 1.86824595743221411048e+00, + 6.40625000000009103829e-01, 1.89766655033813602671e+00, + 6.56249999999993782751e-01, 1.92755045016753268072e+00, + 6.71875000000002109424e-01, 1.95790495294292221651e+00, + 6.87499999999992450483e-01, 1.98873746958227681780e+00, + 7.03125000000004996004e-01, 2.02005552770870666635e+00, + 7.18750000000007105427e-01, 2.05186677348799140219e+00, + 7.34375000000008770762e-01, 2.08417897349558689513e+00, + 7.49999999999983901766e-01, 2.11700001661264058939e+00, + 7.65624999999997002398e-01, 2.15033791595229351046e+00, + 7.81250000000005884182e-01, 2.18420081081563077774e+00, + 7.96874999999991451283e-01, 2.21859696867912603579e+00, + 8.12500000000000000000e-01, 2.25353478721320854561e+00, + 8.28125000000008215650e-01, 2.28902279633221983346e+00, + 8.43749999999997890576e-01, 2.32506966027711614586e+00, + 8.59374999999999444888e-01, 2.36168417973090827289e+00, + 8.75000000000003219647e-01, 2.39887529396710563745e+00, + 8.90625000000013433699e-01, 2.43665208303232461162e+00, + 9.06249999999980571097e-01, 2.47502376996297712708e+00, + 9.21874999999984456878e-01, 2.51399972303748420188e+00, + 9.37500000000001887379e-01, 2.55358945806293169412e+00, + 9.53125000000003330669e-01, 2.59380264069854327147e+00, + 9.68749999999989119814e-01, 2.63464908881560244680e+00, + 9.84374999999997890576e-01, 2.67613877489447116176e+00, + 1.00000000000001154632e+00, 2.71828182845907662113e+00, + 1.01562499999999333866e+00, 2.76108853855008318234e+00, + 1.03124999999995980993e+00, 2.80456935623711389738e+00, + 1.04687499999999933387e+00, 2.84873489717039740654e+00, + -1.56249999999999514277e-02, 9.84496437005408453480e-01, + -3.12499999999955972718e-02, 9.69233234476348348707e-01, + -4.68749999999993824384e-02, 9.54206665969188905230e-01, + -6.24999999999976130205e-02, 9.39413062813478028090e-01, + -7.81249999999989314103e-02, 9.24848813216205822840e-01, + -9.37499999999995975442e-02, 9.10510361380034494161e-01, + -1.09374999999998584466e-01, 8.96394206635151680196e-01, + -1.24999999999998556710e-01, 8.82496902584596676355e-01, + -1.40624999999999361622e-01, 8.68815056262843721235e-01, + -1.56249999999999111822e-01, 8.55345327307423297647e-01, + -1.71874999999924144012e-01, 8.42084427143446223596e-01, + -1.87499999999996752598e-01, 8.29029118180403035154e-01, + -2.03124999999988037347e-01, 8.16176213022349550386e-01, + -2.18749999999995947686e-01, 8.03522573689063990265e-01, + -2.34374999999996419531e-01, 7.91065110850298847112e-01, + -2.49999999999996280753e-01, 7.78800783071407765057e-01, + -2.65624999999999888978e-01, 7.66726596070820165529e-01, + -2.81249999999989397370e-01, 7.54839601989015340777e-01, + -2.96874999999996114219e-01, 7.43136898668761203268e-01, + -3.12499999999999555911e-01, 7.31615628946642115871e-01, + -3.28124999999993782751e-01, 7.20272979955444259126e-01, + -3.43749999999997946087e-01, 7.09106182437399867879e-01, + -3.59374999999994337863e-01, 6.98112510068129799023e-01, + -3.74999999999994615418e-01, 6.87289278790975899369e-01, + -3.90624999999999000799e-01, 6.76633846161729612945e-01, + -4.06249999999947264406e-01, 6.66143610703522903727e-01, + -4.21874999999988453681e-01, 6.55816011271509125002e-01, + -4.37499999999999111822e-01, 6.45648526427892610613e-01, + -4.53124999999999278355e-01, 6.35638673826052436056e-01, + -4.68749999999999278355e-01, 6.25784009604591573428e-01, + -4.84374999999992894573e-01, 6.16082127790682609891e-01, + -4.99999999999998168132e-01, 6.06530659712634534486e-01, + -5.15625000000000000000e-01, 5.97127273421627413619e-01, + -5.31249999999989785948e-01, 5.87869673122352498496e-01, + -5.46874999999972688514e-01, 5.78755598612500032907e-01, + -5.62500000000000000000e-01, 5.69782824730923009859e-01, + -5.78124999999992339461e-01, 5.60949160814475100700e-01, + -5.93749999999948707696e-01, 5.52252450163048691500e-01, + -6.09374999999552580121e-01, 5.43690569513243682209e-01, + -6.24999999999984789945e-01, 5.35261428518998383375e-01, + -6.40624999999983457677e-01, 5.26962969243379708573e-01, + -6.56249999999998334665e-01, 5.18793165653890220312e-01, + -6.71874999999943378626e-01, 5.10750023129039609771e-01, + -6.87499999999997002398e-01, 5.02831577970942467104e-01, + -7.03124999999991118216e-01, 4.95035896926202978463e-01, + -7.18749999999991340260e-01, 4.87361076713623331269e-01, + -7.34374999999985678123e-01, 4.79805243559684402310e-01, + -7.49999999999997335465e-01, 4.72366552741015965911e-01, + -7.65624999999993782751e-01, 4.65043188134059204408e-01, + -7.81249999999863220523e-01, 4.57833361771676883301e-01, + -7.96874999999998112621e-01, 4.50735313406363247157e-01, + -8.12499999999990119015e-01, 4.43747310081084256339e-01, + -8.28124999999996003197e-01, 4.36867645705559026759e-01, + -8.43749999999988120614e-01, 4.30094640640067360504e-01, + -8.59374999999994115818e-01, 4.23426641285265303871e-01, + -8.74999999999977129406e-01, 4.16862019678517936594e-01, + -8.90624999999983346655e-01, 4.10399173096376801428e-01, + -9.06249999999991784350e-01, 4.04036523663345414903e-01, + -9.21874999999994004796e-01, 3.97772517966614058693e-01, + -9.37499999999994337863e-01, 3.91605626676801210628e-01, + -9.53124999999999444888e-01, 3.85534344174578935682e-01, + -9.68749999999986677324e-01, 3.79557188183094640355e-01, + -9.84374999999992339461e-01, 3.73672699406045860648e-01, + -9.99999999999995892175e-01, 3.67879441171443832825e-01, + -1.01562499999994315658e+00, 3.62175999080846300338e-01, + -1.03124999999991096011e+00, 3.56560980663978732697e-01, + -1.04687499999999067413e+00, 3.51033015038813400732e-01, +}; + +static const double zC[] = { + 0.5, + 4.61662413084468283841e+01, /* 0x40471547, 0x652b82fe */ + 2.16608493865351192653e-02, /* 0x3f962e42, 0xfee00000 */ + 5.96317165397058656257e-12, /* 0x3d9a39ef, 0x35793c76 */ + 1.6666666666526086527e-1, /* 3fc5555555548f7c */ + 4.1666666666226079285e-2, /* 3fa5555555545d4e */ + 8.3333679843421958056e-3, /* 3f811115b7aa905e */ + 1.3888949086377719040e-3, /* 3f56c1728d739765 */ + 1.0, + 0.0, + 7.09782712893383973096e+02, /* 0x40862E42, 0xFEFA39EF */ + 7.45133219101941108420e+02, /* 0x40874910, 0xD52D3051 */ + 5.55111512312578270212e-17, /* 0x3c900000, 0x00000000 */ + 1.0e-100, + 1.0e-300, + 1.0e300 +}; + +#ifndef DBL_MIN +#define DBL_MIN 2.2250738585072014E-308 +#endif + +#define half zC[0] +#define invln2_32 zC[1] +#define ln2_32hi zC[2] +#define ln2_32lo zC[3] +#define t2 zC[4] +#define t3 zC[5] +#define t4 zC[6] +#define t5 zC[7] +#define one zC[8] +#define zero zC[9] +#define threshold1 zC[10] +#define threshold2 zC[11] +#define twom54 zC[12] +#define small zC[13] +#define tiny zC[14] +#define hhuge zC[15] + + +void +__set_err_exp (double x, double result) +{ + if (_LIB_VERSION != _IEEE_) + { + if (_LIB_VERSION == _POSIX_) + { + __set_errno (ERANGE); + } + else + { + struct exception exc; + exc.arg1 = x; + exc.type = DOMAIN; + exc.name = (char *) "exp"; + exc.retval = result; + if (!matherr (&exc)) + { + __set_errno (ERANGE); + } + } + } +} + + +double +__exp_niagara4 (double x_arg) +{ + double z, t; + double retval; + int hx, ix, k, j, m; + union + { + int i_part[2]; + double x; + } xx; + union + { + int y_part[2]; + double y; + } yy; + xx.x = x_arg; + + ix = xx.i_part[HIWORD]; + hx = ix & ~0x80000000; + + if (hx < 0x3ff0a2b2) + { /* |x| < 3/2 ln 2 */ + if (hx < 0x3f862e42) + { /* |x| < 1/64 ln 2 */ + if (hx < 0x3ed00000) + { /* |x| < 2^-18 */ + /* raise inexact if x != 0 */ + if (hx < 0x3e300000) + { + retval = one + xx.x; + return (retval); + } + retval = one + xx.x * (one + half * xx.x); + return (retval); + } + t = xx.x * xx.x; + yy.y = xx.x + (t * (half + xx.x * t2) + + (t * t) * (t3 + xx.x * t4 + t * t5)); + retval = one + yy.y; + return (retval); + } + + /* find the multiple of 2^-6 nearest x */ + k = hx >> 20; + j = (0x00100000 | (hx & 0x000fffff)) >> (0x40c - k); + j = (j - 1) & ~1; + if (ix < 0) + j += 134; + z = xx.x - TBL2[j]; + t = z * z; + /* the "small" term below guarantees inexact will be raised */ + yy.y = z + (t * (half + (z * t2 + small)) + + (t * t) * (t3 + z * t4 + t * t5)); + retval = TBL2[j + 1] + TBL2[j + 1] * yy.y; + return (retval); + } + + if (hx >= 0x40862e42) + { /* x is large, infinite, or nan */ + if (hx >= 0x7ff00000) + { + if (ix == 0xfff00000 && xx.i_part[LOWORD] == 0) + return (zero); /* exp(-inf) = 0 */ + return (xx.x * xx.x); /* exp(nan/inf) is nan or inf */ + } + if (xx.x > threshold1) + { /* set overflow error condition */ + retval = hhuge * hhuge; + __set_err_exp (xx.x, retval); + return retval; + } + if (-xx.x > threshold2) + { /* set underflow error condition */ + double force_underflow = tiny * tiny; + math_force_eval (force_underflow); + retval = zero; + __set_err_exp (xx.x, retval); + return retval; + } + } + + t = invln2_32 * xx.x; + if (ix < 0) + t -= half; + else + t += half; + k = (int) t; + j = (k & 0x1f) << 1; + m = k >> 5; + z = (xx.x - k * ln2_32hi) - k * ln2_32lo; + + /* z is now in primary range */ + t = z * z; + yy.y = z + (t * (half + z * t2) + (t * t) * (t3 + z * t4 + t * t5)); + yy.y = TBL[j] + (TBL[j + 1] + TBL[j] * yy.y); + if (m < -1021) + { + yy.y_part[HIWORD] += (m + 54) << 20; + retval = twom54 * yy.y; + if (retval < DBL_MIN) + { + double force_underflow = tiny * tiny; + math_force_eval (force_underflow); + __set_err_exp (xx.x, retval); + } + return (retval); + } + yy.y_part[HIWORD] += m << 20; + return (yy.y); +} diff --git a/sysdeps/sparc/sparc64/fpu/multiarch/e_expf-niagara4.c b/sysdeps/sparc/sparc64/fpu/multiarch/e_expf-niagara4.c new file mode 100644 index 0000000..ba0b1e7 --- /dev/null +++ b/sysdeps/sparc/sparc64/fpu/multiarch/e_expf-niagara4.c @@ -0,0 +1,445 @@ +/* EXPF function + Contributed by Oracle Inc. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +/* INDENT OFF */ +/* + * float expf(float x); + * Code by K.C. Ng for SUN 5.0 libmopt + * 11/5/99 + * Method : + * 1. For |x| >= 2^7, either underflow/overflow. + * More precisely: + * x > 88.722839355...(0x42B17218) => overflow; + * x < -103.97207642..(0xc2CFF1B4) => underflow. + * 2. For |x| < 2^-6, use polynomail + * exp(x) = 1 + x + p1*x^2 + p2*x^3 + * 3. Otherwise, write |x|=(1+r)*2^n, where 0<=r<1. + * Let t = 2^n * (1+r) .... x > 0; + * t = 2^n * (1-r) .... x < 0. (x= -2**(n+1)+t) + * Since -6 <= n <= 6, we may break t into + * six 6-bits chunks: + * -5 -11 -17 -23 -29 + * t=j *2+j *2 +j *2 +j *2 +j *2 +j *2 + * 1 2 3 4 5 6 + * + * where 0 <= j < 64 for i = 1,...,6. + * i + * Note that since t has only 24 significant bits, + * either j or j must be 0. + * 1 6 + * 7-6i + * One may define j by (int) ( t * 2 ) mod 64 + * i + * mathematically. In actual implementation, they can + * be obtained by manipulating the exponent and + * mantissa bits as follow: + * Let ix = (HEX(x)&0x007fffff)|0x00800000. + * If n>=0, let ix=ix<<n, then j =0 and + * 6 + * j = ix>>(30-6i)) mod 64 ...i=1,...,5 + * i + * Otherwise, let ix=ix<<(j+6), then j = 0 and + * 1 + * j = ix>>(36-6i)) mod 64 ...i=2,...,6 + * i + * + * 4. Compute exp(t) by table look-up method. + * Precompute ET[k] = exp(j*2^(7-6i)), k=j+64*(6-i). + * Then + * exp(t) = ET[j +320]*ET[j +256]*ET[j +192]* + * 1 2 3 + * + * ET[j +128]*ET[j +64]*ET[j ] + * 4 5 6 + * + * n+1 + * 5. If x < 0, return exp(-2 )* exp(t). Note that + * -6 <= n <= 6. Let k = n - 6, then we can + * precompute + * k-5 n+1 + * EN[k] = exp(-2 ) = exp(-2 ) for k=0,1,...,12. + * + * + * Special cases: + * exp(INF) is INF, exp(NaN) is NaN; + * exp(-INF) = 0; + * for finite argument, only exp(0) = 1 is exact. + * + * Accuracy: + * All calculations are done in double precision except for + * the case |x| < 2^-6. When |x| < 2^-6, the error is less + * than 0.55 ulp. When |x| >= 2^-6, the error is less than + * 0.51 ulp. + */ +/* INDENT ON */ + +/* #pragma weak expf = __expf_niagara4 */ +#include <sparc-ifunc.h> +#include "libm_endian.h" +#include <math.h> +#include <errno.h> + +extern float expf (float); +extern float __expf_niagara4 (float); +extern float __ieee754_expf (float); + +sparc_libm_ifunc (expf, hwcap & HWCAP_SPARC_CRYPTO ? + __expf_niagara4 : __ieee754_expf); + + +void +__set_err_expf (float x, float result) +{ + if (_LIB_VERSION != _IEEE_) + { + if (_LIB_VERSION == _POSIX_) + { + __set_errno (ERANGE); + } + else + { + struct exception exc; + exc.arg1 = x; + exc.type = DOMAIN; + exc.name = (char *) "exp"; + exc.retval = result; + if (!matherr (&exc)) + { + __set_errno (ERANGE); + } + } + } +} + + +/* + * ET[k] = exp(j*2^(7-6i)) , where j = k mod 64, i = k/64 + */ +static const double ET[] = { + 1.00000000000000000000e+00, 1.00000000186264514923e+00, + 1.00000000372529029846e+00, 1.00000000558793544769e+00, + 1.00000000745058059692e+00, 1.00000000931322574615e+00, + 1.00000001117587089539e+00, 1.00000001303851604462e+00, + 1.00000001490116119385e+00, 1.00000001676380656512e+00, + 1.00000001862645171435e+00, 1.00000002048909686359e+00, + 1.00000002235174201282e+00, 1.00000002421438716205e+00, + 1.00000002607703253332e+00, 1.00000002793967768255e+00, + 1.00000002980232283178e+00, 1.00000003166496798102e+00, + 1.00000003352761335229e+00, 1.00000003539025850152e+00, + 1.00000003725290365075e+00, 1.00000003911554879998e+00, + 1.00000004097819417126e+00, 1.00000004284083932049e+00, + 1.00000004470348446972e+00, 1.00000004656612984100e+00, + 1.00000004842877499023e+00, 1.00000005029142036150e+00, + 1.00000005215406551073e+00, 1.00000005401671088201e+00, + 1.00000005587935603124e+00, 1.00000005774200140252e+00, + 1.00000005960464655175e+00, 1.00000006146729192302e+00, + 1.00000006332993707225e+00, 1.00000006519258244353e+00, + 1.00000006705522759276e+00, 1.00000006891787296404e+00, + 1.00000007078051811327e+00, 1.00000007264316348454e+00, + 1.00000007450580863377e+00, 1.00000007636845400505e+00, + 1.00000007823109937632e+00, 1.00000008009374452556e+00, + 1.00000008195638989683e+00, 1.00000008381903526811e+00, + 1.00000008568168063938e+00, 1.00000008754432578861e+00, + 1.00000008940697115989e+00, 1.00000009126961653116e+00, + 1.00000009313226190244e+00, 1.00000009499490705167e+00, + 1.00000009685755242295e+00, 1.00000009872019779422e+00, + 1.00000010058284316550e+00, 1.00000010244548853677e+00, + 1.00000010430813368600e+00, 1.00000010617077905728e+00, + 1.00000010803342442856e+00, 1.00000010989606979983e+00, + 1.00000011175871517111e+00, 1.00000011362136054238e+00, + 1.00000011548400591366e+00, 1.00000011734665128493e+00, + 1.00000000000000000000e+00, 1.00000011920929665621e+00, + 1.00000023841860752327e+00, 1.00000035762793260119e+00, + 1.00000047683727188996e+00, 1.00000059604662538959e+00, + 1.00000071525599310007e+00, 1.00000083446537502141e+00, + 1.00000095367477115360e+00, 1.00000107288418149665e+00, + 1.00000119209360605055e+00, 1.00000131130304481530e+00, + 1.00000143051249779091e+00, 1.00000154972196497738e+00, + 1.00000166893144637470e+00, 1.00000178814094198287e+00, + 1.00000190735045180190e+00, 1.00000202655997583179e+00, + 1.00000214576951407253e+00, 1.00000226497906652412e+00, + 1.00000238418863318657e+00, 1.00000250339821405987e+00, + 1.00000262260780914403e+00, 1.00000274181741843904e+00, + 1.00000286102704194491e+00, 1.00000298023667966163e+00, + 1.00000309944633158921e+00, 1.00000321865599772764e+00, + 1.00000333786567807692e+00, 1.00000345707537263706e+00, + 1.00000357628508140806e+00, 1.00000369549480438991e+00, + 1.00000381470454158261e+00, 1.00000393391429298617e+00, + 1.00000405312405860059e+00, 1.00000417233383842586e+00, + 1.00000429154363246198e+00, 1.00000441075344070896e+00, + 1.00000452996326316679e+00, 1.00000464917309983548e+00, + 1.00000476838295071502e+00, 1.00000488759281580542e+00, + 1.00000500680269510667e+00, 1.00000512601258861878e+00, + 1.00000524522249634174e+00, 1.00000536443241827556e+00, + 1.00000548364235442023e+00, 1.00000560285230477575e+00, + 1.00000572206226934213e+00, 1.00000584127224811937e+00, + 1.00000596048224110746e+00, 1.00000607969224830640e+00, + 1.00000619890226971620e+00, 1.00000631811230533685e+00, + 1.00000643732235516836e+00, 1.00000655653241921073e+00, + 1.00000667574249746394e+00, 1.00000679495258992802e+00, + 1.00000691416269660294e+00, 1.00000703337281748873e+00, + 1.00000715258295258536e+00, 1.00000727179310189285e+00, + 1.00000739100326541120e+00, 1.00000751021344314040e+00, + 1.00000000000000000000e+00, 1.00000762942363508046e+00, + 1.00001525890547848796e+00, 1.00002288844553022251e+00, + 1.00003051804379095024e+00, 1.00003814770026133729e+00, + 1.00004577741494138365e+00, 1.00005340718783175546e+00, + 1.00006103701893311886e+00, 1.00006866690824547383e+00, + 1.00007629685576948653e+00, 1.00008392686150582307e+00, + 1.00009155692545448346e+00, 1.00009918704761613384e+00, + 1.00010681722799144033e+00, 1.00011444746658040295e+00, + 1.00012207776338368781e+00, 1.00012970811840196106e+00, + 1.00013733853163522269e+00, 1.00014496900308413885e+00, + 1.00015259953274937565e+00, 1.00016023012063093311e+00, + 1.00016786076672947736e+00, 1.00017549147104567453e+00, + 1.00018312223357952462e+00, 1.00019075305433191581e+00, + 1.00019838393330284809e+00, 1.00020601487049298761e+00, + 1.00021364586590300050e+00, 1.00022127691953288675e+00, + 1.00022890803138353455e+00, 1.00023653920145494389e+00, + 1.00024417042974778091e+00, 1.00025180171626271175e+00, + 1.00025943306099973640e+00, 1.00026706446395974304e+00, + 1.00027469592514273167e+00, 1.00028232744454959047e+00, + 1.00028995902218031944e+00, 1.00029759065803558471e+00, + 1.00030522235211605242e+00, 1.00031285410442172257e+00, + 1.00032048591495348333e+00, 1.00032811778371155675e+00, + 1.00033574971069616488e+00, 1.00034338169590819589e+00, + 1.00035101373934764979e+00, 1.00035864584101541475e+00, + 1.00036627800091149076e+00, 1.00037391021903676602e+00, + 1.00038154249539146257e+00, 1.00038917482997580244e+00, + 1.00039680722279067382e+00, 1.00040443967383629875e+00, + 1.00041207218311289928e+00, 1.00041970475062136359e+00, + 1.00042733737636191371e+00, 1.00043497006033499375e+00, + 1.00044260280254104778e+00, 1.00045023560298029786e+00, + 1.00045786846165363215e+00, 1.00046550137856127272e+00, + 1.00047313435370366363e+00, 1.00048076738708124900e+00, + 1.00000000000000000000e+00, 1.00048840047869447289e+00, + 1.00097703949241645383e+00, 1.00146591715766675179e+00, + 1.00195503359100279717e+00, 1.00244438890903908579e+00, + 1.00293398322844673487e+00, 1.00342381666595459322e+00, + 1.00391388933834746489e+00, 1.00440420136246855165e+00, + 1.00489475285521656645e+00, 1.00538554393354861993e+00, + 1.00587657471447822211e+00, 1.00636784531507639251e+00, + 1.00685935585247099411e+00, 1.00735110644384739942e+00, + 1.00784309720644804642e+00, 1.00833532825757243856e+00, + 1.00882779971457803292e+00, 1.00932051169487890796e+00, + 1.00981346431594687374e+00, 1.01030665769531102782e+00, + 1.01080009195055753324e+00, 1.01129376719933050666e+00, + 1.01178768355933157430e+00, 1.01228184114831898377e+00, + 1.01277624008410960244e+00, 1.01327088048457714109e+00, + 1.01376576246765282008e+00, 1.01426088615132625748e+00, + 1.01475625165364347069e+00, 1.01525185909270931894e+00, + 1.01574770858668572693e+00, 1.01624380025379235093e+00, + 1.01674013421230657883e+00, 1.01723671058056375216e+00, + 1.01773352947695694404e+00, 1.01823059101993673714e+00, + 1.01872789532801233392e+00, 1.01922544251975000229e+00, + 1.01972323271377418585e+00, 1.02022126602876750390e+00, + 1.02071954258347008526e+00, 1.02121806249668067856e+00, + 1.02171682588725554197e+00, 1.02221583287410910934e+00, + 1.02271508357621376817e+00, 1.02321457811260052573e+00, + 1.02371431660235789884e+00, 1.02421429916463280207e+00, + 1.02471452591863054771e+00, 1.02521499698361440167e+00, + 1.02571571247890602763e+00, 1.02621667252388526492e+00, + 1.02671787723799012859e+00, 1.02721932674071725344e+00, + 1.02772102115162167202e+00, 1.02822296059031659254e+00, + 1.02872514517647339893e+00, 1.02922757502982276101e+00, + 1.02973025027015285815e+00, 1.03023317101731093359e+00, + 1.03073633739120262831e+00, 1.03123974951179242510e+00, + 1.00000000000000000000e+00, 1.03174340749910276038e+00, + 1.06449445891785954288e+00, 1.09828514030782575794e+00, + 1.13314845306682632220e+00, 1.16911844616950433284e+00, + 1.20623024942098067136e+00, 1.24452010776609522935e+00, + 1.28402541668774139438e+00, 1.32478475872886569675e+00, + 1.36683794117379631139e+00, 1.41022603492571074746e+00, + 1.45499141461820125087e+00, 1.50117780000012279729e+00, + 1.54883029863413312910e+00, 1.59799544995063325104e+00, + 1.64872127070012819416e+00, 1.70105730184840076014e+00, + 1.75505465696029849809e+00, 1.81076607211938722664e+00, + 1.86824595743222232613e+00, 1.92755045016754467113e+00, + 1.98873746958229191684e+00, 2.05186677348797674725e+00, + 2.11700001661267478426e+00, 2.18420081081561789915e+00, + 2.25353478721320854561e+00, 2.32506966027712103084e+00, + 2.39887529396709808793e+00, 2.47502376996302508871e+00, + 2.55358945806292680913e+00, 2.63464908881563086851e+00, + 2.71828182845904553488e+00, 2.80456935623722669604e+00, + 2.89359594417176113623e+00, 2.98544853936535581340e+00, + 3.08021684891803104733e+00, 3.17799342753883840018e+00, + 3.27887376793867346692e+00, 3.38295639409246895468e+00, + 3.49034295746184142217e+00, 3.60113833627217561073e+00, + 3.71545073794110392029e+00, 3.83339180475841034834e+00, + 3.95507672292057721464e+00, 4.08062433502646015882e+00, + 4.21015725614395996956e+00, 4.34380199356104235164e+00, + 4.48168907033806451778e+00, 4.62395315278208052234e+00, + 4.77073318196760265408e+00, 4.92217250943229078786e+00, + 5.07841903718008147450e+00, 5.23962536212848917216e+00, + 5.40594892514116676097e+00, 5.57755216479125959239e+00, + 5.75460267600573072144e+00, 5.93727337374560715233e+00, + 6.12574266188198635064e+00, 6.32019460743274397174e+00, + 6.52081912033011246166e+00, 6.72781213889469142941e+00, + 6.94137582119703555605e+00, 7.16171874249371143151e+00, + 1.00000000000000000000e+00, 7.38905609893065040694e+00, + 5.45981500331442362040e+01, 4.03428793492735110249e+02, + 2.98095798704172830185e+03, 2.20264657948067178950e+04, + 1.62754791419003915507e+05, 1.20260428416477679275e+06, + 8.88611052050787210464e+06, 6.56599691373305097222e+07, + 4.85165195409790277481e+08, 3.58491284613159179688e+09, + 2.64891221298434715271e+10, 1.95729609428838775635e+11, + 1.44625706429147509766e+12, 1.06864745815244628906e+13, + 7.89629601826806875000e+13, 5.83461742527454875000e+14, + 4.31123154711519500000e+15, 3.18559317571137560000e+16, + 2.35385266837020000000e+17, 1.73927494152050099200e+18, + 1.28516001143593082880e+19, 9.49611942060244828160e+19, + 7.01673591209763143680e+20, 5.18470552858707204506e+21, + 3.83100800071657691546e+22, 2.83075330327469394756e+23, + 2.09165949601299610311e+24, 1.54553893559010391826e+25, + 1.14200738981568423454e+26, 8.43835666874145383188e+26, + 6.23514908081161674391e+27, 4.60718663433129178064e+28, + 3.40427604993174075827e+29, 2.51543867091916687979e+30, + 1.85867174528412788702e+31, 1.37338297954017610775e+32, + 1.01480038811388874615e+33, 7.49841699699012090701e+33, + 5.54062238439350983445e+34, 4.09399696212745451138e+35, + 3.02507732220114256223e+36, 2.23524660373471497416e+37, + 1.65163625499400180987e+38, 1.22040329431784083418e+39, + 9.01762840503429851945e+39, 6.66317621641089618500e+40, + 4.92345828601205826106e+41, 3.63797094760880474988e+42, + 2.68811714181613560943e+43, 1.98626483613765434356e+44, + 1.46766223015544238535e+45, 1.08446385529002313207e+46, + 8.01316426400059069850e+46, 5.92097202766466993617e+47, + 4.37503944726134096988e+48, 3.23274119108485947460e+49, + 2.38869060142499127023e+50, 1.76501688569176554670e+51, + 1.30418087839363225614e+52, 9.63666567360320166416e+52, + 7.12058632688933793173e+53, 5.26144118266638596909e+54, +}; + +/* + * EN[k] = exp(-2^(k-5)) + */ +static const double EN[] = { + 9.69233234476344129860e-01, 9.39413062813475807644e-01, + 8.82496902584595455110e-01, 7.78800783071404878477e-01, + 6.06530659712633424263e-01, 3.67879441171442334024e-01, + 1.35335283236612702318e-01, 1.83156388887341786686e-02, + 3.35462627902511853224e-04, 1.12535174719259116458e-07, + 1.26641655490941755372e-14, 1.60381089054863792659e-28, + 2.57220937264241481170e-56, +}; + +static const float zF[] = { + 0.0f, + 1.0f, + 5.0000000951292138e-01F, + 1.6666518897347284e-01F, + 3.4028234663852885981170E+38F, + 1.1754943508222875079688E-38F, + 88.72283935546875, + -103.972076416 +}; + +#define zero zF[0] +#define one zF[1] +#define p1 zF[2] +#define p2 zF[3] +#define big zF[4] +#define tiny zF[5] +#define thresh1 zF[6] +#define thresh2 zF[7] + + +float +__expf_niagara4 (float xf) +{ + double w, p, q; + int hx, ix, n; + float retval; + union + { + int hx_part; + float my_xf; + } hxx; + hxx.my_xf = xf; + hx = hxx.hx_part; + ix = hx & ~0x80000000; + + if (ix < 0x3c800000) + { /* |x| < 2**-6 */ + if (ix < 0x38800000) /* |x| < 2**-14 */ + return (one + xf); + return (one + (xf + (xf * xf) * (p1 + xf * p2))); + } + + n = ix >> 23; /* biased exponent */ + + if (n >= 0x85) + { /* |x| >= 2^6 */ + if (n >= 0x86) + { /* |x| >= 2^7 */ + if (n >= 0xff) + { /* x is nan of +-inf */ + if (hx == 0xff800000) + return (zero); /* exp(-inf)=0 */ + return (xf * xf); /* exp(nan/inf) is nan or inf */ + } + if (hx > 0) + { + retval = big * big; /* overflow */ + __set_err_expf (hxx.my_xf, retval); + return (retval); + } + else + { + retval = tiny * tiny; /* underflow */ + __set_err_expf (hxx.my_xf, retval); + return (retval); + } + } + if (hxx.my_xf >= thresh1) + { + retval = big * big; /* overflow */ + __set_err_expf (hxx.my_xf, retval); + return (retval); + } + if (hxx.my_xf <= thresh2) + { + retval = tiny * tiny; /* underflow */ + __set_err_expf (hxx.my_xf, retval); + return (retval); + } + } + ix -= n << 23; + if (hx > 0) + ix += 0x800000; + else + ix = 0x800000 - ix; + if (n >= 0x7f) + { /* n >= 0 */ + ix <<= n - 0x7f; + w = ET[(ix & 0x3f) + 64] * ET[((ix >> 6) & 0x3f) + 128]; + p = ET[((ix >> 12) & 0x3f) + 192] * ET[((ix >> 18) & 0x3f) + 256]; + q = ET[((ix >> 24) & 0x3f) + 320]; + } + else + { + ix <<= n - 0x79; + w = ET[ix & 0x3f] * ET[((ix >> 6) & 0x3f) + 64]; + p = ET[((ix >> 12) & 0x3f) + 128] * ET[((ix >> 18) & 0x3f) + 192]; + q = ET[((ix >> 24) & 0x3f) + 256]; + } + xf = (float) ((w * p) * (hx < 0 ? q * EN[n - 0x79] : q)); + return (xf); +} -- 1.7.1

Sparc exp(), expf() performance improvement

Commit Message

Comments

Patch