diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog index 0485e6f698b..ec72c6daa84 100644 --- a/libgfortran/ChangeLog +++ b/libgfortran/ChangeLog @@ -1,3 +1,23 @@ +2017-03-02 Thomas Koenig + + PR fortran/78379 + * m4/matmul.m4: (matmul_'rtype_code`_avx2): Also generate for + reals. Add fma to target options. + (matmul_'rtype_code`): Call AVX2 only if FMA is available. + * generated/matmul_c10.c: Regenerated. + * generated/matmul_c16.c: Regenerated. + * generated/matmul_c4.c: Regenerated. + * generated/matmul_c8.c: Regenerated. + * generated/matmul_i1.c: Regenerated. + * generated/matmul_i16.c: Regenerated. + * generated/matmul_i2.c: Regenerated. + * generated/matmul_i4.c: Regenerated. + * generated/matmul_i8.c: Regenerated. + * generated/matmul_r10.c: Regenerated. + * generated/matmul_r16.c: Regenerated. + * generated/matmul_r4.c: Regenerated. + * generated/matmul_r8.c: Regenerated. + 2017-02-27 Janne Blomqvist * intrinsics/random.c (getosrandom): Don't try to use rand_s on diff --git a/libgfortran/generated/matmul_c10.c b/libgfortran/generated/matmul_c10.c index 2892c50693f..b333a844ea5 100644 --- a/libgfortran/generated/matmul_c10.c +++ b/libgfortran/generated/matmul_c10.c @@ -74,9 +74,6 @@ extern void matmul_c10 (gfc_array_c10 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c10); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_c10_avx (gfc_array_c10 * const restrict retarray, static void matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_c10_avx2 (gfc_array_c10 * const restrict retarray, gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_c10 (gfc_array_c10 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_c10_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_c16.c b/libgfortran/generated/matmul_c16.c index d67ec15a5f2..0ef66c0f4a2 100644 --- a/libgfortran/generated/matmul_c16.c +++ b/libgfortran/generated/matmul_c16.c @@ -74,9 +74,6 @@ extern void matmul_c16 (gfc_array_c16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c16); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_c16_avx (gfc_array_c16 * const restrict retarray, static void matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_c16_avx2 (gfc_array_c16 * const restrict retarray, gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_c16 (gfc_array_c16 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_c16_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c index b20539d6e0b..b30320b37ae 100644 --- a/libgfortran/generated/matmul_c4.c +++ b/libgfortran/generated/matmul_c4.c @@ -74,9 +74,6 @@ extern void matmul_c4 (gfc_array_c4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c4); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_c4_avx (gfc_array_c4 * const restrict retarray, static void matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_c4_avx2 (gfc_array_c4 * const restrict retarray, gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_c4 (gfc_array_c4 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_c4_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c index 954dd5ee064..75b4680c3a0 100644 --- a/libgfortran/generated/matmul_c8.c +++ b/libgfortran/generated/matmul_c8.c @@ -74,9 +74,6 @@ extern void matmul_c8 (gfc_array_c8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_c8); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_c8_avx (gfc_array_c8 * const restrict retarray, static void matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_c8_avx2 (gfc_array_c8 * const restrict retarray, gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_c8 (gfc_array_c8 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_c8_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_i1.c b/libgfortran/generated/matmul_i1.c index 621183671ee..924826338d8 100644 --- a/libgfortran/generated/matmul_i1.c +++ b/libgfortran/generated/matmul_i1.c @@ -74,9 +74,6 @@ extern void matmul_i1 (gfc_array_i1 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i1); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retarray, static void matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_i1_avx2 (gfc_array_i1 * const restrict retarray, gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_i1 (gfc_array_i1 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_i1_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_i16.c b/libgfortran/generated/matmul_i16.c index 51e92fb84c6..f10540ed48a 100644 --- a/libgfortran/generated/matmul_i16.c +++ b/libgfortran/generated/matmul_i16.c @@ -74,9 +74,6 @@ extern void matmul_i16 (gfc_array_i16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i16); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict retarray, static void matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_i16_avx2 (gfc_array_i16 * const restrict retarray, gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_i16 (gfc_array_i16 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_i16_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_i2.c b/libgfortran/generated/matmul_i2.c index 2077db3ed26..55ad5c614e6 100644 --- a/libgfortran/generated/matmul_i2.c +++ b/libgfortran/generated/matmul_i2.c @@ -74,9 +74,6 @@ extern void matmul_i2 (gfc_array_i2 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i2); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retarray, static void matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_i2_avx2 (gfc_array_i2 * const restrict retarray, gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_i2 (gfc_array_i2 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_i2_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c index 053f4a5263c..97b4a5b6aa0 100644 --- a/libgfortran/generated/matmul_i4.c +++ b/libgfortran/generated/matmul_i4.c @@ -74,9 +74,6 @@ extern void matmul_i4 (gfc_array_i4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i4); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retarray, static void matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_i4_avx2 (gfc_array_i4 * const restrict retarray, gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_i4 (gfc_array_i4 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_i4_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c index a3418d8a6a3..ae78ecfccb6 100644 --- a/libgfortran/generated/matmul_i8.c +++ b/libgfortran/generated/matmul_i8.c @@ -74,9 +74,6 @@ extern void matmul_i8 (gfc_array_i8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_i8); - - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -628,7 +625,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retarray, static void matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_i8_avx2 (gfc_array_i8 * const restrict retarray, gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas, @@ -2277,7 +2274,8 @@ void matmul_i8 (gfc_array_i8 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_i8_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_r10.c b/libgfortran/generated/matmul_r10.c index 24a331a948c..11d059198c9 100644 --- a/libgfortran/generated/matmul_r10.c +++ b/libgfortran/generated/matmul_r10.c @@ -74,13 +74,6 @@ extern void matmul_r10 (gfc_array_r10 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r10); -#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -632,7 +625,7 @@ matmul_r10_avx (gfc_array_r10 * const restrict retarray, static void matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_r10_avx2 (gfc_array_r10 * const restrict retarray, gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas, @@ -2281,7 +2274,8 @@ void matmul_r10 (gfc_array_r10 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_r10_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_r16.c b/libgfortran/generated/matmul_r16.c index be3e8fd217f..73e7c9877ad 100644 --- a/libgfortran/generated/matmul_r16.c +++ b/libgfortran/generated/matmul_r16.c @@ -74,13 +74,6 @@ extern void matmul_r16 (gfc_array_r16 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r16); -#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -632,7 +625,7 @@ matmul_r16_avx (gfc_array_r16 * const restrict retarray, static void matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_r16_avx2 (gfc_array_r16 * const restrict retarray, gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas, @@ -2281,7 +2274,8 @@ void matmul_r16 (gfc_array_r16 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_r16_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c index b7a40d30b5d..ac7306fe035 100644 --- a/libgfortran/generated/matmul_r4.c +++ b/libgfortran/generated/matmul_r4.c @@ -74,13 +74,6 @@ extern void matmul_r4 (gfc_array_r4 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r4); -#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -632,7 +625,7 @@ matmul_r4_avx (gfc_array_r4 * const restrict retarray, static void matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_r4_avx2 (gfc_array_r4 * const restrict retarray, gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas, @@ -2281,7 +2274,8 @@ void matmul_r4 (gfc_array_r4 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_r4_avx2; goto tailcall; diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c index cffa4ecce90..8d2e784de60 100644 --- a/libgfortran/generated/matmul_r8.c +++ b/libgfortran/generated/matmul_r8.c @@ -74,13 +74,6 @@ extern void matmul_r8 (gfc_array_r8 * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_r8); -#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif - - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -632,7 +625,7 @@ matmul_r8_avx (gfc_array_r8 * const restrict retarray, static void matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static void matmul_r8_avx2 (gfc_array_r8 * const restrict retarray, gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas, @@ -2281,7 +2274,8 @@ void matmul_r8 (gfc_array_r8 * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_r8_avx2; goto tailcall; diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4 index 27381293dba..812a7e7e571 100644 --- a/libgfortran/m4/matmul.m4 +++ b/libgfortran/m4/matmul.m4 @@ -75,14 +75,6 @@ extern void matmul_'rtype_code` ('rtype` * const restrict retarray, int blas_limit, blas_call gemm); export_proto(matmul_'rtype_code`); -'ifelse(rtype_letter,`r',dnl -`#if defined(HAVE_AVX) && defined(HAVE_AVX2) -/* REAL types generate identical code for AVX and AVX2. Only generate - an AVX2 function if we are dealing with integer. */ -#undef HAVE_AVX2 -#endif') -` - /* Put exhaustive list of possible architectures here here, ORed together. */ #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F) @@ -101,7 +93,7 @@ static' include(matmul_internal.m4)dnl `static void 'matmul_name` ('rtype` * const restrict retarray, 'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas, - int blas_limit, blas_call gemm) __attribute__((__target__("avx2"))); + int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma"))); static' include(matmul_internal.m4)dnl `#endif /* HAVE_AVX2 */ @@ -147,7 +139,8 @@ void matmul_'rtype_code` ('rtype` * const restrict retarray, #endif /* HAVE_AVX512F */ #ifdef HAVE_AVX2 - if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2)) + && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA))) { matmul_p = matmul_'rtype_code`_avx2; goto tailcall;