PowerPC: Set/restore rounding mode only when needed

This patch helps some math functions performance by adding the libc_fexxx
variant of inline functions to handle both FPU round and exception set/restore
and by using them on the libc_fexxx_ctx functions. It is based on already coded
fexxx family functions for PPC with fpu.

Here is the summary of performance improvements due this patch (measured on a
POWER7 machine):

Before:

cos(): ITERS:9.5895e+07: TOTAL:5116.03Mcy, MAX:77.6cy, MIN:49.792cy, 18744 calls/Mcy
exp(): ITERS:2.827e+07: TOTAL:5187.15Mcy, MAX:494.018cy, MIN:38.422cy, 5450.01 calls/Mcy
pow(): ITERS:6.1705e+07: TOTAL:5144.26Mcy, MAX:171.95cy, MIN:29.935cy, 11994.9 calls/Mcy
sin(): ITERS:8.6898e+07: TOTAL:5117.06Mcy, MAX:83.841cy, MIN:46.582cy, 16982 calls/Mcy
tan(): ITERS:2.9473e+07: TOTAL:5115.39Mcy, MAX:191.017cy, MIN:172.352cy, 5761.63 calls/Mcy

After:

cos(): ITERS:2.05265e+08: TOTAL:5111.37Mcy, MAX:78.754cy, MIN:24.196cy, 40158.5 calls/Mcy
exp(): ITERS:3.341e+07: TOTAL:5170.84Mcy, MAX:476.317cy, MIN:15.574cy, 6461.23 calls/Mcy
pow(): ITERS:7.6153e+07: TOTAL:5129.1Mcy, MAX:147.5cy, MIN:30.916cy, 14847.2 calls/Mcy
sin(): ITERS:1.58816e+08: TOTAL:5115.11Mcy, MAX:1490.39cy, MIN:22.341cy, 31048.4 calls/Mcy
tan(): ITERS:3.4964e+07: TOTAL:5114.18Mcy, MAX:177.422cy, MIN:146.115cy, 6836.68 calls/Mcy
This commit is contained in:
Adhemerval Zanella 2013-11-25 06:12:30 -06:00
parent 085b5ddfe3
commit 41e8926aa4
8 changed files with 308 additions and 24 deletions

View File

@ -1,3 +1,19 @@
2013-11-25 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/ieee754/ldbl-128ibm/e_expl.c (__ieee754_expl): Use
SET_RESTORE_ROUND instead of feholdexcept/fesetround/fesetenv.
* sysdeps/powerpc/fpu/fenv_libc.h (__fegetround): Remove define.
(__fesetround): Remove define.
* sysdeps/powerpc/fpu/fenv_private.h: New file: Inline floating point
rounding and exceptions handling.
* sysdeps/powerpc/fpu/math_private.h: Include fenv_private.h.
* sysdeps/powerpc/fpu/fenv_libc.h (__fe_mask_env): Define as hidden.
(__fe_nomask_env): Likewise.
* sysdeps/ieee754/ldbl-128ibm/s_llrintl.c (__llrintl): Use
__fegetround instead of fegetround.
* sysdeps/ieee754/ldbl-128ibm/s_lrintl.c (__lrintl): Likewise.
* sysdeps/ieee754/ldbl-128ibm/s_rintl.c (__rintl): Likewise.
2013-11-21 Roland McGrath <roland@hack.frob.com>
* malloc/malloc.c: Move #include <sys/param.h> to the top; comment why

View File

@ -134,18 +134,17 @@ static const long double C[] = {
long double
__ieee754_expl (long double x)
{
long double result, x22;
union ibm_extended_long_double ex2_u, scale_u;
int unsafe;
/* Check for usual case. */
if (isless (x, himark) && isgreater (x, lomark))
{
int tval1, tval2, unsafe, n_i, exponent2;
long double x22, n, result, xl;
union ibm_extended_long_double ex2_u, scale_u;
fenv_t oldenv;
int tval1, tval2, n_i, exponent2;
long double n, xl;
feholdexcept (&oldenv);
#ifdef FE_TONEAREST
fesetround (FE_TONEAREST);
#endif
SET_RESTORE_ROUND (FE_TONEAREST);
n = __roundl (x*M_1_LN2);
x = x-n*M_LN2_0;
@ -201,11 +200,6 @@ __ieee754_expl (long double x)
less than 4.8e-39. */
x22 = x + x*x*(P1+x*(P2+x*(P3+x*(P4+x*(P5+x*P6)))));
/* Return result. */
fesetenv (&oldenv);
result = x22 * ex2_u.ld + ex2_u.ld;
/* Now we can test whether the result is ultimate or if we are unsure.
In the later case we should probably call a mpn based routine to give
the ultimate result.
@ -235,10 +229,6 @@ __ieee754_expl (long double x)
return __ieee754_expl_proc2 (origx);
}
*/
if (!unsafe)
return result;
else
return result * scale_u.ld;
}
/* Exceptional cases: */
else if (isless (x, himark))
@ -253,5 +243,10 @@ __ieee754_expl (long double x)
else
/* Return x, if x is a NaN or Inf; or overflow, otherwise. */
return TWO1023*x;
result = x22 * ex2_u.ld + ex2_u.ld;
if (!unsafe)
return result;
return result * scale_u.ld;
}
strong_alias (__ieee754_expl, __expl_finite)

View File

@ -43,7 +43,7 @@ __llrintl (long double x)
#endif
)
{
save_round = fegetround ();
save_round = __fegetround ();
if (__builtin_expect ((xh == -(double) (-__LONG_LONG_MAX__ - 1)), 0))
{

View File

@ -49,7 +49,7 @@ __lrintl (long double x)
#endif
)
{
save_round = fegetround ();
save_round = __fegetround ();
#if __LONG_MAX__ == 2147483647
long long llhi = (long long) xh;

View File

@ -40,7 +40,7 @@ __rintl (long double x)
__builtin_inf ()), 1))
{
double orig_xh;
int save_round = fegetround ();
int save_round = __fegetround ();
/* Long double arithmetic, including the canonicalisation below,
only works in round-to-nearest mode. */

View File

@ -23,9 +23,9 @@
#include <ldsodefs.h>
#include <sysdep.h>
extern const fenv_t *__fe_nomask_env (void);
extern const fenv_t *__fe_nomask_env (void) attribute_hidden;
extern const fenv_t *__fe_mask_env (void);
extern const fenv_t *__fe_mask_env (void) attribute_hidden;
/* The sticky bits in the FPSCR indicating exceptions have occurred. */
#define FPSCR_STICKY_BITS ((FE_ALL_EXCEPT | FE_ALL_INVALID) & ~FE_INVALID)
@ -83,7 +83,6 @@ __fegetround (void)
"mfcr %0" : "=r"(result) : : "cr7");
return result & 3;
}
#define fegetround() __fegetround()
static inline int
__fesetround (int round)
@ -107,7 +106,6 @@ __fesetround (int round)
return 0;
}
#define fesetround(mode) __fesetround(mode)
/* Definitions of all the FPSCR bit numbers */
enum {

View File

@ -0,0 +1,274 @@
/* Private floating point rounding and exceptions handling. PowerPC version.
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
#ifndef FENV_PRIVATE_H
#define FENV_PRIVATE_H 1
#include <fenv.h>
#include <fenv_libc.h>
#include <fpu_control.h>
#define _FPU_MASK_ALL (_FPU_MASK_ZM | _FPU_MASK_OM | _FPU_MASK_UM \
| _FPU_MASK_XM | _FPU_MASK_IM)
/* Mask everything but the rounding moded and non-IEEE arithmetic flags. */
#define _FPU_MASK_ROUNDING 0xffffffff00000007LL
/* Mask restore rounding mode and exception enabled. */
#define _FPU_MASK_EXCEPT_ROUND 0xffffffff1fffff00LL
/* Mask exception enable but fraction rounded/inexact and FP result/CC
bits. */
#define _FPU_MASK_FRAC_INEX_RET_CC 0x1ff80fff
static __always_inline void
libc_feholdexcept_ppc (fenv_t *envp)
{
fenv_union_t old, new;
old.fenv = *envp = fegetenv_register ();
new.l = old.l & _FPU_MASK_ROUNDING;
/* If the old env had any enabled exceptions, then mask SIGFPE in the
MSR FE0/FE1 bits. This may allow the FPU to run faster because it
always takes the default action and can not generate SIGFPE. */
if ((old.l & _FPU_MASK_ALL) != 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
}
static __always_inline void
libc_fesetround_ppc (int r)
{
__fesetround (r);
}
static __always_inline void
libc_feholdexcept_setround_ppc (fenv_t *envp, int r)
{
fenv_union_t old, new;
old.fenv = *envp = fegetenv_register ();
new.l = (old.l & _FPU_MASK_ROUNDING) | r;
if ((old.l & _FPU_MASK_ALL) != 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
}
static __always_inline int
libc_fetestexcept_ppc (int e)
{
fenv_union_t u;
u.fenv = fegetenv_register ();
return u.l & e;
}
static __always_inline void
libc_fesetenv_ppc (const fenv_t *envp)
{
fenv_union_t old, new;
new.fenv = *envp;
old.fenv = fegetenv_register ();
/* If the old env has no enabled exceptions and the new env has any enabled
exceptions, then unmask SIGFPE in the MSR FE0/FE1 bits. This will put the
hardware into "precise mode" and may cause the FPU to run slower on some
hardware. */
if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
(void) __fe_nomask_env ();
/* If the old env had any enabled exceptions and the new env has no enabled
exceptions, then mask SIGFPE in the MSR FE0/FE1 bits. This may allow the
FPU to run faster because it always takes the default action and can not
generate SIGFPE. */
if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
(void) __fe_mask_env ();
fesetenv_register (*envp);
}
static __always_inline int
libc_feupdateenv_test_ppc (fenv_t *envp, int ex)
{
fenv_union_t old, new;
new.fenv = *envp;
old.fenv = fegetenv_register ();
/* Restore rounding mode and exception enable from *envp and merge
exceptions. Leave fraction rounded/inexact and FP result/CC bits
unchanged. */
new.l = (old.l & _FPU_MASK_EXCEPT_ROUND)
| (new.l & _FPU_MASK_FRAC_INEX_RET_CC);
if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
(void) __fe_nomask_env ();
if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
return old.l & ex;
}
static __always_inline void
libc_feupdateenv_ppc (fenv_t *e)
{
libc_feupdateenv_test_ppc (e, 0);
}
static __always_inline void
libc_feholdsetround_ppc (fenv_t *e, int r)
{
fenv_union_t old, new;
old.fenv = fegetenv_register ();
/* Clear current precision and set newer one. */
new.l = (old.l & ~0x3) | r;
*e = old.fenv;
if ((old.l & _FPU_MASK_ALL) != 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
}
static __always_inline void
libc_feresetround_ppc (fenv_t *envp)
{
fenv_union_t old, new;
new.fenv = *envp;
old.fenv = fegetenv_register ();
/* Restore rounding mode and exception enable from *envp and merge
exceptions. Leave fraction rounded/inexact and FP result/CC bits
unchanged. */
new.l = (old.l & _FPU_MASK_EXCEPT_ROUND)
| (new.l & _FPU_MASK_FRAC_INEX_RET_CC);
if ((old.l & _FPU_MASK_ALL) == 0 && (new.l & _FPU_MASK_ALL) != 0)
(void) __fe_nomask_env ();
if ((old.l & _FPU_MASK_ALL) != 0 && (new.l & _FPU_MASK_ALL) == 0)
(void) __fe_mask_env ();
/* Atomically enable and raise (if appropriate) exceptions set in `new'. */
fesetenv_register (new.fenv);
}
#define libc_feholdexceptf libc_feholdexcept_ppc
#define libc_feholdexcept libc_feholdexcept_ppc
#define libc_feholdexcept_setroundf libc_feholdexcept_setround_ppc
#define libc_feholdexcept_setround libc_feholdexcept_setround_ppc
#define libc_fetestexceptf libc_fetestexcept_ppc
#define libc_fetestexcept libc_fetestexcept_ppc
#define libc_fesetroundf libc_fesetround_ppc
#define libc_fesetround libc_fesetround_ppc
#define libc_fesetenvf libc_fesetenv_ppc
#define libc_fesetenv libc_fesetenv_ppc
#define libc_feupdateenv_testf libc_feupdateenv_test_ppc
#define libc_feupdateenv_test libc_feupdateenv_test_ppc
#define libc_feupdateenvf libc_feupdateenv_ppc
#define libc_feupdateenv libc_feupdateenv_ppc
#define libc_feholdsetroundf libc_feholdsetround_ppc
#define libc_feholdsetround libc_feholdsetround_ppc
#define libc_feresetroundf libc_feresetround_ppc
#define libc_feresetround libc_feresetround_ppc
/* We have support for rounding mode context. */
#define HAVE_RM_CTX 1
static __always_inline void
libc_feholdexcept_setround_ppc_ctx (struct rm_ctx *ctx, int r)
{
fenv_union_t old, new;
old.fenv = fegetenv_register ();
new.l = (old.l & _FPU_MASK_ROUNDING) | r;
ctx->env = old.fenv;
if (__glibc_unlikely (new.l != old.l))
{
if ((old.l & _FPU_MASK_ALL) != 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
ctx->updated_status = true;
}
else
ctx->updated_status = false;
}
static __always_inline void
libc_fesetenv_ppc_ctx (struct rm_ctx *ctx)
{
libc_fesetenv_ppc (&ctx->env);
}
static __always_inline void
libc_feupdateenv_ppc_ctx (struct rm_ctx *ctx)
{
if (__glibc_unlikely (ctx->updated_status))
libc_feupdateenv_test_ppc (&ctx->env, 0);
}
static __always_inline void
libc_feholdsetround_ppc_ctx (struct rm_ctx *ctx, int r)
{
fenv_union_t old, new;
old.fenv = fegetenv_register ();
new.l = (old.l & ~0x3) | r;
ctx->env = old.fenv;
if (__glibc_unlikely (new.l != old.l))
{
if ((old.l & _FPU_MASK_ALL) != 0)
(void) __fe_mask_env ();
fesetenv_register (new.fenv);
ctx->updated_status = true;
}
else
ctx->updated_status = false;
}
static __always_inline void
libc_feresetround_ppc_ctx (struct rm_ctx *ctx)
{
if (__glibc_unlikely (ctx->updated_status))
libc_feresetround_ppc (&ctx->env);
}
#define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_ppc_ctx
#define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_ppc_ctx
#define libc_fesetenv_ctx libc_fesetenv_ppc_ctx
#define libc_fesetenvf_ctx libc_fesetenv_ppc_ctx
#define libc_feholdsetround_ctx libc_feholdsetround_ppc_ctx
#define libc_feholdsetroundf_ctx libc_feholdsetround_ppc_ctx
#define libc_feresetround_ctx libc_feresetround_ppc_ctx
#define libc_feresetroundf_ctx libc_feresetround_ppc_ctx
#define libc_feupdateenvf_ctx libc_feupdateenv_ppc_ctx
#define libc_feupdateenv_ctx libc_feupdateenv_ppc_ctx
#endif

View File

@ -22,6 +22,7 @@
#include <sysdep.h>
#include <ldsodefs.h>
#include <dl-procinfo.h>
#include <fenv_private.h>
#include_next <math_private.h>
# if __WORDSIZE == 64 || defined _ARCH_PWR4