Add AVX optimized versions for some x86-64 math functions

This commit is contained in:
Ulrich Drepper 2011-10-25 21:34:55 -04:00
parent ffb124cc51
commit e0016b11d6
30 changed files with 370 additions and 92 deletions

View File

@ -1,5 +1,40 @@
2011-10-25 Ulrich Drepper <drepper@gmail.com>
* sysdeps/ieee754/dbl-64/e_rem_pio2.c: Comment everything out, the
file is not needed.
* sysdeps/x86_64/fpu/multiarch/e_asin.c: Support AVX variants.
* sysdeps/x86_64/fpu/multiarch/e_atan2.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/e_exp.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/e_log.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/s_atan.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/s_sin.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/s_tan.c: Likewise.
* sysdeps/x86_64/fpu/multiarch/Makefile: Fix some CFLAGS-* variables.
Add AVX variants.
* sysdeps/x86_64/fpu/multiarch/brandred-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/doasin-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/dosincos-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/e_asin-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/e_atan2-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/e_exp-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/e_log-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mpa-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mpatan-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mpatan2-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mpexp-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mplog-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mpsqrt-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/mptan-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/s_atan-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/s_sin-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/s_tan-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/sincos32-avx.c: New file.
* sysdeps/x86_64/fpu/multiarch/slowexp-avx.c: New file.
* sysdeps/x86_64/multiarch/init-arch.h: Make bit_* macros available
all the time. Define bit_AVX. Define HAS_* macros using bit_* macros.
* sysdeps/x86_64/multiarch/strcmp-sse42.S: Move common code to earlier
place. Use VEX encoding when compiling for AVX.

View File

@ -1,22 +1,19 @@
/* @(#)e_rem_pio2.c 5.1 93/09/24 */
#ifdef NOT_NEEDED_ANYMORE
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(LIBM_SCCS) && !defined(lint)
static char rcsid[] = "$NetBSD: e_rem_pio2.c,v 1.8 1995/05/10 20:46:02 jtc Exp $";
#endif
/* __ieee754_rem_pio2(x,y)
*
* return the remainder of x rem pi/2 in y[0]+y[1]
*
* return the remainder of x rem pi/2 in y[0]+y[1]
* use __kernel_rem_pio2()
*/
@ -24,31 +21,23 @@ static char rcsid[] = "$NetBSD: e_rem_pio2.c,v 1.8 1995/05/10 20:46:02 jtc Exp $
#include "math_private.h"
/*
* Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
* Table of constants for 2/pi, 396 Hex digits (476 decimal) of 2/pi
*/
#ifdef __STDC__
static const int32_t two_over_pi[] = {
#else
static int32_t two_over_pi[] = {
#endif
0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
};
#ifdef __STDC__
static const int32_t npio2_hw[] = {
#else
static int32_t npio2_hw[] = {
#endif
0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C,
0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A,
@ -67,11 +56,7 @@ static int32_t npio2_hw[] = {
* pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
*/
#ifdef __STDC__
static const double
#else
static double
#endif
static const double
zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
@ -83,12 +68,8 @@ pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
#ifdef __STDC__
int32_t __ieee754_rem_pio2(double x, double *y)
#else
int32_t __ieee754_rem_pio2(x,y)
double x,y[];
#endif
int32_t
__ieee754_rem_pio2(double x, double *y)
{
double z,w,t,r,fn;
double tx[3];
@ -100,9 +81,9 @@ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
if(ix<=0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
{y[0] = x; y[1] = 0; return 0;}
if(ix<0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */
if(hx>0) {
if(hx>0) {
z = x - pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z - pio2_1t;
y[1] = (z-y[0])-pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
@ -113,7 +94,7 @@ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
return 1;
} else { /* negative x */
z = x + pio2_1;
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
if(ix!=0x3ff921fb) { /* 33+53 bit pi is good enough */
y[0] = z + pio2_1t;
y[1] = (z-y[0])+pio2_1t;
} else { /* near pi/2, use 33+33+53 bit pi */
@ -130,36 +111,36 @@ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
fn = (double)n;
r = t-fn*pio2_1;
w = fn*pio2_1t; /* 1st round good to 85 bit */
if(n<32&&ix!=npio2_hw[n-1]) {
if(n<32&&ix!=npio2_hw[n-1]) {
y[0] = r-w; /* quick check no cancellation */
} else {
u_int32_t high;
j = ix>>20;
y[0] = r-w;
u_int32_t high;
j = ix>>20;
y[0] = r-w;
GET_HIGH_WORD(high,y[0]);
i = j-((high>>20)&0x7ff);
if(i>16) { /* 2nd iteration needed, good to 118 */
i = j-((high>>20)&0x7ff);
if(i>16) { /* 2nd iteration needed, good to 118 */
t = r;
w = fn*pio2_2;
w = fn*pio2_2;
r = t-w;
w = fn*pio2_2t-((t-r)-w);
w = fn*pio2_2t-((t-r)-w);
y[0] = r-w;
GET_HIGH_WORD(high,y[0]);
i = j-((high>>20)&0x7ff);
if(i>49) { /* 3rd iteration need, 151 bits acc */
t = r; /* will cover all possible cases */
w = fn*pio2_3;
r = t-w;
w = fn*pio2_3t-((t-r)-w);
y[0] = r-w;
t = r; /* will cover all possible cases */
w = fn*pio2_3;
r = t-w;
w = fn*pio2_3t-((t-r)-w);
y[0] = r-w;
}
}
}
y[1] = (r-y[0])-w;
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
else return n;
}
/*
/*
* all other (large) arguments
*/
if(ix>=0x7ff00000) { /* x is inf or NaN */
@ -168,7 +149,7 @@ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
/* set z = scalbn(|x|,ilogb(x)-23) */
GET_LOW_WORD(low,x);
SET_LOW_WORD(z,low);
e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */
e0 = (ix>>20)-1046; /* e0 = ilogb(z)-23; */
SET_HIGH_WORD(z, ix - ((int32_t)(e0<<20)));
for(i=0;i<2;i++) {
tx[i] = (double)((int32_t)(z));
@ -181,3 +162,5 @@ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
return n;
}
#endif

View File

@ -30,7 +30,36 @@ CFLAGS-s_atan-fma4.c = -mfma4
CFLAGS-sincos32-fma4.c = -mfma4
CFLAGS-slowexp-fma4.c = -mfma4
CFLAGS-slowpow-fma4.c = -mfma4
CLFAGS-s_sin-fma4.c = -mfma4
CLFAGS-s_tan-fma4.c = -mfma4
CFLAGS-s_sin-fma4.c = -mfma4
CFLAGS-s_tan-fma4.c = -mfma4
endif
# AVX variants of selected libm routines.  They are added to the build only
# when config-cflags-avx is "yes".
ifeq ($(config-cflags-avx),yes)
libm-sysdep_routines += e_exp-avx e_log-avx s_atan-avx \
e_asin-avx e_atan2-avx s_sin-avx s_tan-avx \
mplog-avx mpa-avx slowexp-avx \
sincos32-avx doasin-avx dosincos-avx \
brandred-avx mpexp-avx \
mpatan2-avx mpatan-avx mpsqrt-avx mptan-avx
# Every object listed above gets -mavx as a per-file compiler flag.
CFLAGS-brandred-avx.c = -mavx
CFLAGS-doasin-avx.c = -mavx
CFLAGS-dosincos-avx.c = -mavx
CFLAGS-e_asin-avx.c = -mavx
CFLAGS-e_atan2-avx.c = -mavx
CFLAGS-e_exp-avx.c = -mavx
CFLAGS-e_log-avx.c = -mavx
CFLAGS-mpa-avx.c = -mavx
CFLAGS-mpatan-avx.c = -mavx
CFLAGS-mpatan2-avx.c = -mavx
CFLAGS-mpexp-avx.c = -mavx
CFLAGS-mplog-avx.c = -mavx
CFLAGS-mpsqrt-avx.c = -mavx
CFLAGS-mptan-avx.c = -mavx
CFLAGS-s_atan-avx.c = -mavx
CFLAGS-s_sin-avx.c = -mavx
CFLAGS-sincos32-avx.c = -mavx
CFLAGS-slowexp-avx.c = -mavx
CFLAGS-s_tan-avx.c = -mavx
endif
endif

View File

@ -0,0 +1,4 @@
/* AVX variant of the generic dbl-64 branred code: __branred is renamed to
   __branred_avx before the shared source is included.  */
#define __branred __branred_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/branred.c>

View File

@ -0,0 +1,4 @@
/* AVX variant of the generic dbl-64 doasin code: __doasin is renamed to
   __doasin_avx before the shared source is included.  */
#define __doasin __doasin_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/doasin.c>

View File

@ -0,0 +1,6 @@
/* AVX variant of the generic dbl-64 dosincos code.  The three names below
   are given an _avx suffix before the shared source is included.  */
#define __docos __docos_avx
#define __dubcos __dubcos_avx
#define __dubsin __dubsin_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/dosincos.c>

View File

@ -0,0 +1,11 @@
/* AVX variant of the generic dbl-64 e_asin code.  __ieee754_acos_avx and
   __ieee754_asin_avx are the names the multiarch ifunc selector picks when
   HAS_AVX is true.  */
#define __ieee754_acos __ieee754_acos_avx
#define __ieee754_asin __ieee754_asin_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against e_asin.c.  */
#define __cos32 __cos32_avx
#define __doasin __doasin_avx
#define __docos __docos_avx
#define __dubcos __dubcos_avx
#define __dubsin __dubsin_avx
#define __sin32 __sin32_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/e_asin.c>

View File

@ -1,18 +1,29 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math_private.h>
extern double __ieee754_acos_sse2 (double);
extern double __ieee754_acos_fma4 (double);
extern double __ieee754_asin_sse2 (double);
extern double __ieee754_acos_avx (double);
extern double __ieee754_asin_avx (double);
# ifdef HAVE_FMA4_SUPPORT
extern double __ieee754_acos_fma4 (double);
extern double __ieee754_asin_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __ieee754_acos_fma4 ((void *) 0)
# define __ieee754_asin_fma4 ((void *) 0)
# endif
libm_ifunc (__ieee754_acos,
HAS_FMA4 ? __ieee754_acos_fma4 : __ieee754_acos_sse2);
HAS_FMA4 ? __ieee754_acos_fma4
: (HAS_AVX ? __ieee754_acos_avx : __ieee754_acos_sse2));
strong_alias (__ieee754_acos, __acos_finite)
libm_ifunc (__ieee754_asin,
HAS_FMA4 ? __ieee754_asin_fma4 : __ieee754_asin_sse2);
HAS_FMA4 ? __ieee754_asin_fma4
: (HAS_AVX ? __ieee754_asin_avx : __ieee754_asin_sse2));
strong_alias (__ieee754_asin, __asin_finite)
# define __ieee754_acos __ieee754_acos_sse2

View File

@ -0,0 +1,10 @@
/* AVX variant of the generic dbl-64 e_atan2 code.  __ieee754_atan2_avx is
   the name the multiarch ifunc selector picks when HAS_AVX is true.  */
#define __ieee754_atan2 __ieee754_atan2_avx
/* Remaining renames: presumably multi-precision helpers referenced by the
   included source, redirected to their _avx counterparts -- confirm against
   e_atan2.c.  */
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __dvd __dvd_avx
#define __mpatan2 __mpatan2_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/e_atan2.c>

View File

@ -1,12 +1,20 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math_private.h>
extern double __ieee754_atan2_sse2 (double, double);
extern double __ieee754_atan2_avx (double, double);
# ifdef HAVE_FMA4_SUPPORT
extern double __ieee754_atan2_fma4 (double, double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __ieee754_atan2_fma4 ((void *) 0)
# endif
libm_ifunc (__ieee754_atan2,
HAS_FMA4 ? __ieee754_atan2_fma4 : __ieee754_atan2_sse2);
HAS_FMA4 ? __ieee754_atan2_fma4
: (HAS_AVX ? __ieee754_atan2_avx : __ieee754_atan2_sse2));
strong_alias (__ieee754_atan2, __atan2_finite)
# define __ieee754_atan2 __ieee754_atan2_sse2

View File

@ -0,0 +1,6 @@
/* AVX variant of the generic dbl-64 e_exp code.  __ieee754_exp_avx is the
   name the multiarch ifunc selector picks when HAS_AVX is true.  */
#define __ieee754_exp __ieee754_exp_avx
/* Also given an _avx suffix; whether the included source defines or only
   calls these is not visible here.  */
#define __exp1 __exp1_avx
#define __slowexp __slowexp_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/e_exp.c>

View File

@ -1,11 +1,20 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math_private.h>
extern double __ieee754_exp_sse2 (double);
extern double __ieee754_exp_avx (double);
# ifdef HAVE_FMA4_SUPPORT
extern double __ieee754_exp_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __ieee754_exp_fma4 ((void *) 0)
# endif
libm_ifunc (__ieee754_exp, HAS_FMA4 ? __ieee754_exp_fma4 : __ieee754_exp_sse2);
libm_ifunc (__ieee754_exp,
HAS_FMA4 ? __ieee754_exp_fma4
: (HAS_AVX ? __ieee754_exp_avx : __ieee754_exp_sse2));
strong_alias (__ieee754_exp, __exp_finite)
# define __ieee754_exp __ieee754_exp_sse2

View File

@ -0,0 +1,8 @@
/* AVX variant of the generic dbl-64 e_log code.  __ieee754_log_avx is the
   name the multiarch ifunc selector picks when HAS_AVX is true.  */
#define __ieee754_log __ieee754_log_avx
/* Remaining renames: presumably multi-precision helpers referenced by the
   included source, redirected to their _avx counterparts -- confirm against
   e_log.c.  */
#define __mplog __mplog_avx
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/e_log.c>

View File

@ -1,11 +1,21 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math_private.h>
extern double __ieee754_log_sse2 (double);
extern double __ieee754_log_avx (double);
# ifdef HAVE_FMA4_SUPPORT
extern double __ieee754_log_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __ieee754_log_fma4 ((void *) 0)
# endif
libm_ifunc (__ieee754_log, HAS_FMA4 ? __ieee754_log_fma4 : __ieee754_log_sse2);
libm_ifunc (__ieee754_log,
HAS_FMA4 ? __ieee754_log_fma4
: (HAS_AVX ? __ieee754_log_avx
: __ieee754_log_sse2));
strong_alias (__ieee754_log, __log_finite)
# define __ieee754_log __ieee754_log_sse2

View File

@ -0,0 +1,12 @@
/* AVX variant of the generic dbl-64 mpa code.  The five names below are
   given an _avx suffix before the shared source is included.  */
#define __add __add_avx
#define __mul __mul_avx
#define __sub __sub_avx
#define __dbl_mp __dbl_mp_avx
#define __dvd __dvd_avx
/* NOTE(review): these switches presumably make mpa.c omit its __cpy,
   __mp_dbl and __acr definitions so they are not duplicated in this
   object -- confirm against mpa.c.  */
#define NO___CPY 1
#define NO___MP_DBL 1
#define NO___ACR 1
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mpa.c>

View File

@ -0,0 +1,10 @@
/* AVX variant of the generic dbl-64 mpatan code: __mpatan is renamed to
   __mpatan_avx before the shared source is included.  */
#define __mpatan __mpatan_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mpatan.c.  */
#define __add __add_avx
#define __dvd __dvd_avx
#define __mpsqrt __mpsqrt_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* NOTE(review): presumably stops mpatan.c from pulling in mpatan.h --
   confirm what the generic source does with this switch.  */
#define AVOID_MPATAN_H 1
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mpatan.c>

View File

@ -0,0 +1,9 @@
/* AVX variant of the generic dbl-64 mpatan2 code: __mpatan2 is renamed to
   __mpatan2_avx before the shared source is included.  */
#define __mpatan2 __mpatan2_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mpatan2.c.  */
#define __add __add_avx
#define __dvd __dvd_avx
#define __mpatan __mpatan_avx
#define __mpsqrt __mpsqrt_avx
#define __mul __mul_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mpatan2.c>

View File

@ -0,0 +1,9 @@
/* AVX variant of the generic dbl-64 mpexp code: __mpexp is renamed to
   __mpexp_avx before the shared source is included.  */
#define __mpexp __mpexp_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mpexp.c.  */
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __dvd __dvd_avx
#define __mul __mul_avx
/* NOTE(review): presumably stops mpexp.c from pulling in mpexp.h --
   confirm what the generic source does with this switch.  */
#define AVOID_MPEXP_H 1
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mpexp.c>

View File

@ -0,0 +1,8 @@
/* AVX variant of the generic dbl-64 mplog code: __mplog is renamed to
   __mplog_avx before the shared source is included.  */
#define __mplog __mplog_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mplog.c.  */
#define __add __add_avx
#define __mpexp __mpexp_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mplog.c>

View File

@ -0,0 +1,8 @@
/* AVX variant of the generic dbl-64 mpsqrt code: __mpsqrt is renamed to
   __mpsqrt_avx before the shared source is included.  */
#define __mpsqrt __mpsqrt_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mpsqrt.c.  */
#define __dbl_mp __dbl_mp_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* NOTE(review): presumably stops mpsqrt.c from pulling in mpsqrt.h --
   confirm what the generic source does with this switch.  */
#define AVOID_MPSQRT_H 1
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mpsqrt.c>

View File

@ -0,0 +1,7 @@
/* AVX variant of the generic dbl-64 mptan code: __mptan is renamed to
   __mptan_avx before the shared source is included.  */
#define __mptan __mptan_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against mptan.c.  */
#define __c32 __c32_avx
#define __dvd __dvd_avx
#define __mpranred __mpranred_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/mptan.c>

View File

@ -0,0 +1,9 @@
/* AVX variant of the generic dbl-64 s_atan code.  The public name atan is
   renamed to __atan_avx, the name the multiarch ifunc selector picks when
   HAS_AVX is true.  */
#define atan __atan_avx
/* Remaining renames: presumably multi-precision helpers referenced by the
   included source, redirected to their _avx counterparts -- confirm against
   s_atan.c.  */
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __mpatan __mpatan_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/s_atan.c>

View File

@ -1,11 +1,18 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math.h>
extern double __atan_sse2 (double);
extern double __atan_avx (double);
/* Without FMA4 support compiled in, HAS_FMA4 is forced to 0 and the FMA4
   name becomes a null pointer constant, so the selector expression below
   can still name it.  */
# ifdef HAVE_FMA4_SUPPORT
extern double __atan_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __atan_fma4 ((void *) 0)
# endif
libm_ifunc (atan, HAS_FMA4 ? __atan_fma4 : __atan_sse2);
/* Selection order: FMA4, then AVX, then SSE2.
   NOTE(review): HAS_AVX tests the CPUID AVX feature bit; whether operating
   system support for the YMM state (OSXSAVE/XGETBV) is verified anywhere
   is not visible here -- confirm before relying on this.  */
libm_ifunc (atan, HAS_FMA4 ? __atan_fma4 : HAS_AVX ? __atan_avx : __atan_sse2);
# define atan __atan_sse2
#endif

View File

@ -0,0 +1,12 @@
/* AVX variant of the generic dbl-64 s_sin code.  __cos_avx and __sin_avx
   are the names the multiarch ifunc selector picks when HAS_AVX is true.  */
#define __cos __cos_avx
#define __sin __sin_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against s_sin.c.  */
#define __branred __branred_avx
#define __docos __docos_avx
#define __dubsin __dubsin_avx
#define __mpcos __mpcos_avx
#define __mpcos1 __mpcos1_avx
#define __mpsin __mpsin_avx
#define __mpsin1 __mpsin1_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/s_sin.c>

View File

@ -1,17 +1,26 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math.h>
# undef NAN
extern double __cos_sse2 (double);
extern double __cos_fma4 (double);
extern double __sin_sse2 (double);
extern double __cos_avx (double);
extern double __sin_avx (double);
# ifdef HAVE_FMA4_SUPPORT
extern double __cos_fma4 (double);
extern double __sin_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __cos_fma4 ((void *) 0)
# define __sin_fma4 ((void *) 0)
# endif
libm_ifunc (__cos, HAS_FMA4 ? __cos_fma4 : __cos_sse2);
libm_ifunc (__cos, HAS_FMA4 ? __cos_fma4 : HAS_AVX ? __cos_avx : __cos_sse2);
weak_alias (__cos, cos)
libm_ifunc (__sin, HAS_FMA4 ? __sin_fma4 : __sin_sse2);
libm_ifunc (__sin, HAS_FMA4 ? __sin_fma4 : HAS_AVX ? __sin_avx : __sin_sse2);
weak_alias (__sin, sin)
# define __cos __cos_sse2

View File

@ -0,0 +1,9 @@
/* AVX variant of the generic dbl-64 s_tan code.  The public name tan is
   renamed to __tan_avx, the name the multiarch ifunc selector picks when
   HAS_AVX is true.  */
#define tan __tan_avx
/* Remaining renames: presumably helpers referenced by the included source,
   redirected to their _avx counterparts -- confirm against s_tan.c.  */
#define __branred __branred_avx
#define __dbl_mp __dbl_mp_avx
#define __mpranred __mpranred_avx
#define __mptan __mptan_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/s_tan.c>

View File

@ -1,11 +1,18 @@
#ifdef HAVE_FMA4_SUPPORT
#if defined HAVE_FMA4_SUPPORT || defined HAVE_AVX_SUPPORT
# include <init-arch.h>
# include <math.h>
extern double __tan_sse2 (double);
extern double __tan_avx (double);
/* Without FMA4 support compiled in, HAS_FMA4 is forced to 0 and the FMA4
   name becomes a null pointer constant, so the selector expression below
   can still name it.  */
# ifdef HAVE_FMA4_SUPPORT
extern double __tan_fma4 (double);
# else
# undef HAS_FMA4
# define HAS_FMA4 0
# define __tan_fma4 ((void *) 0)
# endif
libm_ifunc (tan, HAS_FMA4 ? __tan_fma4 : __tan_sse2);
/* Selection order: FMA4, then AVX, then SSE2.
   NOTE(review): HAS_AVX tests the CPUID AVX feature bit; whether operating
   system support for the YMM state (OSXSAVE/XGETBV) is verified anywhere
   is not visible here -- confirm before relying on this.  */
libm_ifunc (tan, HAS_FMA4 ? __tan_fma4 : HAS_AVX ? __tan_avx : __tan_sse2);
# define tan __tan_sse2
#endif

View File

@ -0,0 +1,15 @@
/* AVX variant of the generic dbl-64 sincos32 code.  Every name below is
   given an _avx suffix before the shared source is included; which of them
   the included source defines and which it only calls is not visible
   here.  */
#define __cos32 __cos32_avx
#define __sin32 __sin32_avx
#define __c32 __c32_avx
#define __mpsin __mpsin_avx
#define __mpsin1 __mpsin1_avx
#define __mpcos __mpcos_avx
#define __mpcos1 __mpcos1_avx
#define __mpranred __mpranred_avx
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/sincos32.c>

View File

@ -0,0 +1,9 @@
/* AVX variant of the generic dbl-64 slowexp code: __slowexp is renamed to
   __slowexp_avx before the shared source is included.  */
#define __slowexp __slowexp_avx
/* Remaining renames: presumably multi-precision helpers referenced by the
   included source, redirected to their _avx counterparts -- confirm against
   slowexp.c.  */
#define __add __add_avx
#define __dbl_mp __dbl_mp_avx
#define __mpexp __mpexp_avx
#define __mul __mul_avx
#define __sub __sub_avx
/* SECTION expands to a section attribute naming ".text.avx"; presumably the
   included source applies it to its function definitions -- confirm there.  */
#define SECTION __attribute__ ((section (".text.avx")))
#include <sysdeps/ieee754/dbl-64/slowexp.c>

View File

@ -23,16 +23,19 @@
#define bit_Fast_Unaligned_Load (1 << 4)
#define bit_Prefer_PMINUB_for_stringop (1 << 5)
/* CPUID feature flags expressed as bit masks (not bit positions).  The
   HAS_* feature tests pair each mask with the CPUID leaf and register it is
   checked against.  */
#define bit_SSE2 (1 << 26)
#define bit_SSSE3 (1 << 9)
#define bit_SSE4_1 (1 << 19)
#define bit_SSE4_2 (1 << 20)
#define bit_AVX (1 << 28)
#define bit_POPCOUNT (1 << 23)
#define bit_FMA (1 << 12)
#define bit_FMA4 (1 << 16)
#ifdef __ASSEMBLER__
# include <ifunc-defines.h>
# define bit_SSE2 (1 << 26)
# define bit_SSSE3 (1 << 9)
# define bit_SSE4_1 (1 << 19)
# define bit_SSE4_2 (1 << 20)
# define bit_AVX (1 << 28)
# define index_SSE2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
# define index_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
@ -104,17 +107,18 @@ extern const struct cpu_features *__get_cpu_features (void)
# endif
# define HAS_CPU_FEATURE(idx, reg, bit) \
((__get_cpu_features ()->cpuid[idx].reg & (1 << (bit))) != 0)
((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
/* Following are the feature tests used throughout libc. */
# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, 26)
# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 23)
# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 9)
# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 19)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
# define HAS_FMA4 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_80000001, ecx, 16)
# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
# define HAS_POPCOUNT HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT)
# define HAS_SSSE3 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3)
# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_FMA)
# define HAS_AVX HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_AVX)
# define HAS_FMA4 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
# define index_Fast_Rep_String FEATURE_INDEX_1
# define index_Fast_Copy_Backward FEATURE_INDEX_1