re PR libfortran/78379 (Processor-specific versions for matmul)
2017-03-02 Thomas Koenig <tkoenig@gcc.gnu.org> PR fortran/78379 * m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for reals. Add fma to target options. (matmul_'rtype_code`): Call AVX2 only if FMA is available. * generated/matmul_c10.c: Regenerated. * generated/matmul_c16.c: Regenerated. * generated/matmul_c4.c: Regenerated. * generated/matmul_c8.c: Regenerated. * generated/matmul_i1.c: Regenerated. * generated/matmul_i16.c: Regenerated. * generated/matmul_i2.c: Regenerated. * generated/matmul_i4.c: Regenerated. * generated/matmul_i8.c: Regenerated. * generated/matmul_r10.c: Regenerated. * generated/matmul_r16.c: Regenerated. * generated/matmul_r4.c: Regenerated. * generated/matmul_r8.c: Regenerated. From-SVN: r245836
This commit is contained in:
parent
db9f7f657e
commit
6d03bdcc81
@ -1,3 +1,23 @@
|
||||
2017-03-02 Thomas Koenig <tkoenig@gcc.gnu.org>
|
||||
|
||||
PR fortran/78379
|
||||
* m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for
|
||||
reals. Add fma to target options.
|
||||
(matmul_'rtype_code`): Call AVX2 only if FMA is available.
|
||||
* generated/matmul_c10.c: Regenerated.
|
||||
* generated/matmul_c16.c: Regenerated.
|
||||
* generated/matmul_c4.c: Regenerated.
|
||||
* generated/matmul_c8.c: Regenerated.
|
||||
* generated/matmul_i1.c: Regenerated.
|
||||
* generated/matmul_i16.c: Regenerated.
|
||||
* generated/matmul_i2.c: Regenerated.
|
||||
* generated/matmul_i4.c: Regenerated.
|
||||
* generated/matmul_i8.c: Regenerated.
|
||||
* generated/matmul_r10.c: Regenerated.
|
||||
* generated/matmul_r16.c: Regenerated.
|
||||
* generated/matmul_r4.c: Regenerated.
|
||||
* generated/matmul_r8.c: Regenerated.
|
||||
|
||||
2017-02-27 Janne Blomqvist <jb@gcc.gnu.org>
|
||||
|
||||
* intrinsics/random.c (getosrandom): Don't try to use rand_s on
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_c10);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_c10_avx (gfc_array_c10 * const restrict retarray,
|
||||
static void
|
||||
matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_c10_avx2 (gfc_array_c10 * const restrict retarray,
|
||||
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_c10_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_c16);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_c16_avx (gfc_array_c16 * const restrict retarray,
|
||||
static void
|
||||
matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_c16_avx2 (gfc_array_c16 * const restrict retarray,
|
||||
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_c16_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_c4);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_c4_avx (gfc_array_c4 * const restrict retarray,
|
||||
static void
|
||||
matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_c4_avx2 (gfc_array_c4 * const restrict retarray,
|
||||
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_c4_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_c8);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_c8_avx (gfc_array_c8 * const restrict retarray,
|
||||
static void
|
||||
matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_c8_avx2 (gfc_array_c8 * const restrict retarray,
|
||||
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_c8_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_i1 (gfc_array_i1 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_i1);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray,
|
||||
static void
|
||||
matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_i1_avx2 (gfc_array_i1 * const restrict retarray,
|
||||
gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_i1_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_i16);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray,
|
||||
static void
|
||||
matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_i16_avx2 (gfc_array_i16 * const restrict retarray,
|
||||
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_i16_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_i2 (gfc_array_i2 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_i2);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray,
|
||||
static void
|
||||
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_i2_avx2 (gfc_array_i2 * const restrict retarray,
|
||||
gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_i2_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_i4);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray,
|
||||
static void
|
||||
matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_i4_avx2 (gfc_array_i4 * const restrict retarray,
|
||||
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_i4_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,9 +74,6 @@ extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_i8);
|
||||
|
||||
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -628,7 +625,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray,
|
||||
static void
|
||||
matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_i8_avx2 (gfc_array_i8 * const restrict retarray,
|
||||
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
|
||||
@ -2277,7 +2274,8 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_i8_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,13 +74,6 @@ extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_r10);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -632,7 +625,7 @@ matmul_r10_avx (gfc_array_r10 * const restrict retarray,
|
||||
static void
|
||||
matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_r10_avx2 (gfc_array_r10 * const restrict retarray,
|
||||
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
|
||||
@ -2281,7 +2274,8 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_r10_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,13 +74,6 @@ extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_r16);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -632,7 +625,7 @@ matmul_r16_avx (gfc_array_r16 * const restrict retarray,
|
||||
static void
|
||||
matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_r16_avx2 (gfc_array_r16 * const restrict retarray,
|
||||
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
|
||||
@ -2281,7 +2274,8 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_r16_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,13 +74,6 @@ extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_r4);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -632,7 +625,7 @@ matmul_r4_avx (gfc_array_r4 * const restrict retarray,
|
||||
static void
|
||||
matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_r4_avx2 (gfc_array_r4 * const restrict retarray,
|
||||
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
|
||||
@ -2281,7 +2274,8 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_r4_avx2;
|
||||
goto tailcall;
|
||||
|
@ -74,13 +74,6 @@ extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_r8);
|
||||
|
||||
#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif
|
||||
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -632,7 +625,7 @@ matmul_r8_avx (gfc_array_r8 * const restrict retarray,
|
||||
static void
|
||||
matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static void
|
||||
matmul_r8_avx2 (gfc_array_r8 * const restrict retarray,
|
||||
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
|
||||
@ -2281,7 +2274,8 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_r8_avx2;
|
||||
goto tailcall;
|
||||
|
@ -75,14 +75,6 @@ extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
int blas_limit, blas_call gemm);
|
||||
export_proto(matmul_'rtype_code`);
|
||||
|
||||
'ifelse(rtype_letter,`r',dnl
|
||||
`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
|
||||
/* REAL types generate identical code for AVX and AVX2. Only generate
|
||||
an AVX2 function if we are dealing with integer. */
|
||||
#undef HAVE_AVX2
|
||||
#endif')
|
||||
`
|
||||
|
||||
/* Put exhaustive list of possible architectures here, ORed together. */
|
||||
|
||||
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
|
||||
@ -101,7 +93,7 @@ static' include(matmul_internal.m4)dnl
|
||||
`static void
|
||||
'matmul_name` ('rtype` * const restrict retarray,
|
||||
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
|
||||
int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
|
||||
static' include(matmul_internal.m4)dnl
|
||||
`#endif /* HAVE_AVX2 */
|
||||
|
||||
@ -147,7 +139,8 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray,
|
||||
#endif /* HAVE_AVX512F */
|
||||
|
||||
#ifdef HAVE_AVX2
|
||||
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
|
||||
&& (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
|
||||
{
|
||||
matmul_p = matmul_'rtype_code`_avx2;
|
||||
goto tailcall;
|
||||
|
Loading…
Reference in New Issue
Block a user