re PR libfortran/78379 (Processor-specific versions for matmul)

2016-12-03  Thomas Koenig  <tkoenig@gcc.gnu.org>

        PR fortran/78379
        * config/i386/cpuinfo.c:  Move denums for processor vendors,
        processor type, processor subtypes and declaration of
        struct __processor_model into
        * config/i386/cpuinfo.h:  New header file.
        * Makefile.am:  Add dependence of m4/matmul_internal_m4 to
        mamtul files..
        * Makefile.in:  Regenerated.
        * acinclude.m4:  Check for AVX, AVX2 and AVX512F.
        * config.h.in:  Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F.
        * configure:  Regenerated.
        * configure.ac:  Use checks for AVX, AVX2 and AVX_512F.
        * m4/matmul_internal.m4:  New file. working part of matmul.m4.
        * m4/matmul.m4:  Implement architecture-specific switching
        for AVX, AVX2 and AVX512F by including matmul_internal.m4
        multiple times.
        * generated/matmul_c10.c: Regenerated.
        * generated/matmul_c16.c: Regenerated.
        * generated/matmul_c4.c: Regenerated.
        * generated/matmul_c8.c: Regenerated.
        * generated/matmul_i1.c: Regenerated.
        * generated/matmul_i16.c: Regenerated.
        * generated/matmul_i2.c: Regenerated.
        * generated/matmul_i4.c: Regenerated.
        * generated/matmul_i8.c: Regenerated.
        * generated/matmul_r10.c: Regenerated.
        * generated/matmul_r16.c: Regenerated.
        * generated/matmul_r4.c: Regenerated.
        * generated/matmul_r8.c: Regenerated.

From-SVN: r243219
This commit is contained in:
Thomas Koenig 2016-12-03 09:44:35 +00:00
parent 802583a210
commit 31cfd83286
25 changed files with 29983 additions and 613 deletions

View File

@ -1,3 +1,11 @@
2016-12-03 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/78379
* config/i386/cpuinfo.c: Move denums for processor vendors,
processor type, processor subtypes and declaration of
struct __processor_model into
* config/i386/cpuinfo.h: New header file.
2016-12-02 Andre Vieira <andre.simoesdiasvieira@arm.com>
Thomas Preud'homme <thomas.preudhomme@arm.com>

View File

@ -26,6 +26,7 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
#include "cpuid.h"
#include "tsystem.h"
#include "auto-target.h"
#include "cpuinfo.h"
#ifdef HAVE_INIT_PRIORITY
#define CONSTRUCTOR_PRIORITY (101)
@ -36,97 +37,8 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
int __cpu_indicator_init (void)
__attribute__ ((constructor CONSTRUCTOR_PRIORITY));
/* Processor Vendor and Models. */
enum processor_vendor
{
VENDOR_INTEL = 1,
VENDOR_AMD,
VENDOR_OTHER,
VENDOR_MAX
};
/* Any new types or subtypes have to be inserted at the end. */
enum processor_types
{
INTEL_BONNELL = 1,
INTEL_CORE2,
INTEL_COREI7,
AMDFAM10H,
AMDFAM15H,
INTEL_SILVERMONT,
INTEL_KNL,
AMD_BTVER1,
AMD_BTVER2,
AMDFAM17H,
CPU_TYPE_MAX
};
enum processor_subtypes
{
INTEL_COREI7_NEHALEM = 1,
INTEL_COREI7_WESTMERE,
INTEL_COREI7_SANDYBRIDGE,
AMDFAM10H_BARCELONA,
AMDFAM10H_SHANGHAI,
AMDFAM10H_ISTANBUL,
AMDFAM15H_BDVER1,
AMDFAM15H_BDVER2,
AMDFAM15H_BDVER3,
AMDFAM15H_BDVER4,
AMDFAM17H_ZNVER1,
INTEL_COREI7_IVYBRIDGE,
INTEL_COREI7_HASWELL,
INTEL_COREI7_BROADWELL,
INTEL_COREI7_SKYLAKE,
INTEL_COREI7_SKYLAKE_AVX512,
CPU_SUBTYPE_MAX
};
/* ISA Features supported. New features have to be inserted at the end. */
enum processor_features
{
FEATURE_CMOV = 0,
FEATURE_MMX,
FEATURE_POPCNT,
FEATURE_SSE,
FEATURE_SSE2,
FEATURE_SSE3,
FEATURE_SSSE3,
FEATURE_SSE4_1,
FEATURE_SSE4_2,
FEATURE_AVX,
FEATURE_AVX2,
FEATURE_SSE4_A,
FEATURE_FMA4,
FEATURE_XOP,
FEATURE_FMA,
FEATURE_AVX512F,
FEATURE_BMI,
FEATURE_BMI2,
FEATURE_AES,
FEATURE_PCLMUL,
FEATURE_AVX512VL,
FEATURE_AVX512BW,
FEATURE_AVX512DQ,
FEATURE_AVX512CD,
FEATURE_AVX512ER,
FEATURE_AVX512PF,
FEATURE_AVX512VBMI,
FEATURE_AVX512IFMA,
FEATURE_AVX5124VNNIW,
FEATURE_AVX5124FMAPS
};
struct __processor_model
{
unsigned int __cpu_vendor;
unsigned int __cpu_type;
unsigned int __cpu_subtype;
unsigned int __cpu_features[1];
} __cpu_model = { };
struct __processor_model __cpu_model = { };
/* Get the specific type of AMD CPU. */

View File

@ -0,0 +1,116 @@
/* Get CPU type and Features for x86 processors.
Copyright (C) 2012-2016 Free Software Foundation, Inc.
Contributed by Sriraman Tallam (tmsriram@google.com)
This file is part of GCC.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Processor Vendor and Models. */
enum processor_vendor
{
VENDOR_INTEL = 1,
VENDOR_AMD,
VENDOR_OTHER,
VENDOR_MAX
};
/* Any new types or subtypes have to be inserted at the end. */
enum processor_types
{
INTEL_BONNELL = 1,
INTEL_CORE2,
INTEL_COREI7,
AMDFAM10H,
AMDFAM15H,
INTEL_SILVERMONT,
INTEL_KNL,
AMD_BTVER1,
AMD_BTVER2,
AMDFAM17H,
CPU_TYPE_MAX
};
enum processor_subtypes
{
INTEL_COREI7_NEHALEM = 1,
INTEL_COREI7_WESTMERE,
INTEL_COREI7_SANDYBRIDGE,
AMDFAM10H_BARCELONA,
AMDFAM10H_SHANGHAI,
AMDFAM10H_ISTANBUL,
AMDFAM15H_BDVER1,
AMDFAM15H_BDVER2,
AMDFAM15H_BDVER3,
AMDFAM15H_BDVER4,
AMDFAM17H_ZNVER1,
INTEL_COREI7_IVYBRIDGE,
INTEL_COREI7_HASWELL,
INTEL_COREI7_BROADWELL,
INTEL_COREI7_SKYLAKE,
INTEL_COREI7_SKYLAKE_AVX512,
CPU_SUBTYPE_MAX
};
/* ISA Features supported. New features have to be inserted at the end. */
enum processor_features
{
FEATURE_CMOV = 0,
FEATURE_MMX,
FEATURE_POPCNT,
FEATURE_SSE,
FEATURE_SSE2,
FEATURE_SSE3,
FEATURE_SSSE3,
FEATURE_SSE4_1,
FEATURE_SSE4_2,
FEATURE_AVX,
FEATURE_AVX2,
FEATURE_SSE4_A,
FEATURE_FMA4,
FEATURE_XOP,
FEATURE_FMA,
FEATURE_AVX512F,
FEATURE_BMI,
FEATURE_BMI2,
FEATURE_AES,
FEATURE_PCLMUL,
FEATURE_AVX512VL,
FEATURE_AVX512BW,
FEATURE_AVX512DQ,
FEATURE_AVX512CD,
FEATURE_AVX512ER,
FEATURE_AVX512PF,
FEATURE_AVX512VBMI,
FEATURE_AVX512IFMA,
FEATURE_AVX5124VNNIW,
FEATURE_AVX5124FMAPS
};
extern struct __processor_model
{
unsigned int __cpu_vendor;
unsigned int __cpu_type;
unsigned int __cpu_subtype;
unsigned int __cpu_features[1];
} __cpu_model;

View File

@ -1,3 +1,31 @@
2016-12-03 Thomas Koenig <tkoenig@gcc.gnu.org>
PR fortran/78379
* Makefile.am: Add dependence of m4/matmul_internal_m4 to
mamtul files..
* Makefile.in: Regenerated.
* acinclude.m4: Check for AVX, AVX2 and AVX512F.
* config.h.in: Add HAVE_AVX, HAVE_AVX2 and HAVE_AVX512F.
* configure: Regenerated.
* configure.ac: Use checks for AVX, AVX2 and AVX_512F.
* m4/matmul_internal.m4: New file. working part of matmul.m4.
* m4/matmul.m4: Implement architecture-specific switching
for AVX, AVX2 and AVX512F by including matmul_internal.m4
multiple times.
* generated/matmul_c10.c: Regenerated.
* generated/matmul_c16.c: Regenerated.
* generated/matmul_c4.c: Regenerated.
* generated/matmul_c8.c: Regenerated.
* generated/matmul_i1.c: Regenerated.
* generated/matmul_i16.c: Regenerated.
* generated/matmul_i2.c: Regenerated.
* generated/matmul_i4.c: Regenerated.
* generated/matmul_i8.c: Regenerated.
* generated/matmul_r10.c: Regenerated.
* generated/matmul_r16.c: Regenerated.
* generated/matmul_r4.c: Regenerated.
* generated/matmul_r8.c: Regenerated.
2016-11-30 Andre Vehreschild <vehre@gcc.gnu.org>
* caf/single.c (_gfortran_caf_get_by_ref): Prevent compile time

View File

@ -987,7 +987,7 @@ $(i_product_c): m4/product.m4 $(I_M4_DEPS1)
$(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
$(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@
$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS)
$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
$(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)

View File

@ -6053,7 +6053,7 @@ fpu-target.inc: fpu-target.h $(srcdir)/libgfortran.h
@MAINTAINER_MODE_TRUE@$(i_sum_c): m4/sum.m4 $(I_M4_DEPS1)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 sum.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@$(i_matmul_c): m4/matmul.m4 m4/matmul_internal.m4 $(I_M4_DEPS)
@MAINTAINER_MODE_TRUE@ $(M4) -Dfile=$@ -I$(srcdir)/m4 matmul.m4 > $@
@MAINTAINER_MODE_TRUE@$(i_matmull_c): m4/matmull.m4 $(I_M4_DEPS)

View File

@ -393,3 +393,54 @@ AC_DEFUN([LIBGFOR_CHECK_STRERROR_R], [
[Define if strerror_r takes two arguments and is available in <string.h>.]),)
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for AVX
AC_DEFUN([LIBGFOR_CHECK_AVX], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
void _mm256_zeroall (void)
{
__builtin_ia32_vzeroall ();
}]], [[]])],
AC_DEFINE(HAVE_AVX, 1,
[Define if AVX instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for AVX2
AC_DEFUN([LIBGFOR_CHECK_AVX2], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx2"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
__v4di
mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
{
return __builtin_ia32_andnotsi256 (__X, __Y);
}]], [[]])],
AC_DEFINE(HAVE_AVX2, 1,
[Define if AVX2 instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])
dnl Check for AVX512f
AC_DEFUN([LIBGFOR_CHECK_AVX512F], [
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx512f"
AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
typedef double __m512d __attribute__ ((__vector_size__ (64)));
__m512d _mm512_add (__m512d a)
{
return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
}]], [[]])],
AC_DEFINE(HAVE_AVX512F, 1,
[Define if AVX512f instructions can be compiled.]),
[])
CFLAGS="$ac_save_CFLAGS"
])

View File

@ -78,6 +78,15 @@
/* Define to 1 if the target supports __attribute__((visibility(...))). */
#undef HAVE_ATTRIBUTE_VISIBILITY
/* Define if AVX instructions can be compiled. */
#undef HAVE_AVX
/* Define if AVX2 instructions can be compiled. */
#undef HAVE_AVX2
/* Define if AVX512f instructions can be compiled. */
#undef HAVE_AVX512F
/* Define to 1 if you have the `cabs' function. */
#undef HAVE_CABS

87
libgfortran/configure vendored
View File

@ -26174,6 +26174,93 @@ $as_echo "#define HAVE_CRLF 1" >>confdefs.h
fi
# Check whether we support AVX extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
void _mm256_zeroall (void)
{
__builtin_ia32_vzeroall ();
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check wether we support AVX2 extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx2"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
__v4di
mm256_is32_andnotsi256 (__v4di __X, __v4di __Y)
{
return __builtin_ia32_andnotsi256 (__X, __Y);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX2 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
# Check wether we support AVX512f extensions
ac_save_CFLAGS="$CFLAGS"
CFLAGS="-O2 -mavx512f"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
typedef double __m512d __attribute__ ((__vector_size__ (64)));
__m512d _mm512_add (__m512d a)
{
return __builtin_ia32_addpd512_mask (a, a, a, 1, 4);
}
int
main ()
{
;
return 0;
}
_ACEOF
if ac_fn_c_try_compile "$LINENO"; then :
$as_echo "#define HAVE_AVX512F 1" >>confdefs.h
fi
rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
CFLAGS="$ac_save_CFLAGS"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure

View File

@ -609,6 +609,15 @@ LIBGFOR_CHECK_UNLINK_OPEN_FILE
# Check whether line terminator is LF or CRLF
LIBGFOR_CHECK_CRLF
# Check whether we support AVX extensions
LIBGFOR_CHECK_AVX
# Check wether we support AVX2 extensions
LIBGFOR_CHECK_AVX2
# Check wether we support AVX512f extensions
LIBGFOR_CHECK_AVX512F
AC_CACHE_SAVE
if test ${multilib} = yes; then

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -76,537 +76,105 @@ extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);
void
matmul_'rtype_code` ('rtype` * const restrict retarray,
'ifelse(rtype_letter,`r',dnl
`#if defined(HAVE_AVX) && defined(HAVE_AVX2)
/* REAL types generate identical code for AVX and AVX2. Only generate
an AVX2 function if we are dealing with integer. */
#undef HAVE_AVX2
#endif')
`
/* Put exhaustive list of possible architectures here here, ORed together. */
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */
#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx2")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */
#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */
/* Function to fall back to if there is no special processor-specific version. */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl
`/* Compiling main function, with selection code for the processor. */
/* Currently, this is i386 only. Adjust for other architectures. */
#include <config/i386/cpuinfo.h>
void matmul_'rtype_code` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;
static void (*matmul_p) ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm) = NULL;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
assert (GFC_DESCRIPTOR_RANK (a) == 2
|| GFC_DESCRIPTOR_RANK (b) == 2);
/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
Either A or B (but not both) can be rank 1:
o One-dimensional argument A is implicitly treated as a row matrix
dimensioned [1,count], so xcount=1.
o One-dimensional argument B is implicitly treated as a column matrix
dimensioned [count, 1], so ycount=1.
*/
if (retarray->base_addr == NULL)
if (matmul_p == NULL)
{
if (GFC_DESCRIPTOR_RANK (a) == 1)
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
}
else if (GFC_DESCRIPTOR_RANK (b) == 1)
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
}
else
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
GFC_DIMENSION_SET(retarray->dim[1], 0,
GFC_DESCRIPTOR_EXTENT(b,1) - 1,
GFC_DESCRIPTOR_EXTENT(retarray,0));
}
retarray->base_addr
= xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
{
index_type ret_extent, arg_extent;
if (GFC_DESCRIPTOR_RANK (a) == 1)
matmul_p = matmul_'rtype_code`_vanilla;
if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
{
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic: is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
else if (GFC_DESCRIPTOR_RANK (b) == 1)
{
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic: is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
else
{
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic for dimension 1:"
" is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic for dimension 2:"
" is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
}
'
sinclude(`matmul_asm_'rtype_code`.m4')dnl
`
if (GFC_DESCRIPTOR_RANK (retarray) == 1)
{
/* One-dimensional result may be addressed in the code below
either as a row or a column matrix. We want both cases to
work. */
rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
}
else
{
rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
}
if (GFC_DESCRIPTOR_RANK (a) == 1)
{
/* Treat it as a a row matrix A[1,count]. */
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
aystride = 1;
xcount = 1;
count = GFC_DESCRIPTOR_EXTENT(a,0);
}
else
{
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
aystride = GFC_DESCRIPTOR_STRIDE(a,1);
count = GFC_DESCRIPTOR_EXTENT(a,1);
xcount = GFC_DESCRIPTOR_EXTENT(a,0);
}
if (count != GFC_DESCRIPTOR_EXTENT(b,0))
{
if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
}
if (GFC_DESCRIPTOR_RANK (b) == 1)
{
/* Treat it as a column matrix B[count,1] */
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
/* bystride should never be used for 1-dimensional b.
in case it is we want it to cause a segfault, rather than
an incorrect result. */
bystride = 0xDEADBEEF;
ycount = 1;
}
else
{
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
bystride = GFC_DESCRIPTOR_STRIDE(b,1);
ycount = GFC_DESCRIPTOR_EXTENT(b,1);
}
abase = a->base_addr;
bbase = b->base_addr;
dest = retarray->base_addr;
/* Now that everything is set up, we perform the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const 'rtype_name` one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
&ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
/* This block of code implements a tuned matmul, derived from
Superscalar GEMM-based level 3 BLAS, Beta version 0.1
Bo Kagstrom and Per Ling
Department of Computing Science
Umea University
S-901 87 Umea, Sweden
from netlib.org, translated to C, and modified for matmul.m4. */
const 'rtype_name` *a, *b;
'rtype_name` *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
i1, i2, i3, i4, i5, i6;
/* Local variables */
'rtype_name` t1[65536], /* was [256][256] */
f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
a = abase;
b = bbase;
c = retarray->base_addr;
/* Parameter adjustments */
c_dim1 = rystride;
c_offset = 1 + c_dim1;
c -= c_offset;
a_dim1 = aystride;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = bystride;
b_offset = 1 + b_dim1;
b -= b_offset;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
return;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
c[i + j * c_dim1] = ('rtype_name`)0;
/* Start turning the crank. */
i1 = n;
for (jj = 1; jj <= i1; jj += 512)
{
/* Computing MIN */
i2 = 512;
i3 = n - jj + 1;
jsec = min(i2,i3);
ujsec = jsec - jsec % 4;
i2 = k;
for (ll = 1; ll <= i2; ll += 256)
/* Run down the available processors in order of preference. */
#ifdef HAVE_AVX512F
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
{
/* Computing MIN */
i3 = 256;
i4 = k - ll + 1;
lsec = min(i3,i4);
ulsec = lsec - lsec % 2;
i3 = m;
for (ii = 1; ii <= i3; ii += 256)
{
/* Computing MIN */
i4 = 256;
i5 = m - ii + 1;
isec = min(i4,i5);
uisec = isec - isec % 2;
i4 = ll + ulsec - 1;
for (l = ll; l <= i4; l += 2)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 2)
{
t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
a[i + l * a_dim1];
t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
a[i + (l + 1) * a_dim1];
t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
a[i + 1 + l * a_dim1];
t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
a[i + 1 + (l + 1) * a_dim1];
}
if (uisec < isec)
{
t1[l - ll + 1 + (isec << 8) - 257] =
a[ii + isec - 1 + l * a_dim1];
t1[l - ll + 2 + (isec << 8) - 257] =
a[ii + isec - 1 + (l + 1) * a_dim1];
}
}
if (ulsec < lsec)
{
i4 = ii + isec - 1;
for (i = ii; i<= i4; ++i)
{
t1[lsec + ((i - ii + 1) << 8) - 257] =
a[i + (ll + lsec - 1) * a_dim1];
}
}
uisec = isec - isec % 4;
i4 = jj + ujsec - 1;
for (j = jj; j <= i4; j += 4)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 4)
{
f11 = c[i + j * c_dim1];
f21 = c[i + 1 + j * c_dim1];
f12 = c[i + (j + 1) * c_dim1];
f22 = c[i + 1 + (j + 1) * c_dim1];
f13 = c[i + (j + 2) * c_dim1];
f23 = c[i + 1 + (j + 2) * c_dim1];
f14 = c[i + (j + 3) * c_dim1];
f24 = c[i + 1 + (j + 3) * c_dim1];
f31 = c[i + 2 + j * c_dim1];
f41 = c[i + 3 + j * c_dim1];
f32 = c[i + 2 + (j + 1) * c_dim1];
f42 = c[i + 3 + (j + 1) * c_dim1];
f33 = c[i + 2 + (j + 2) * c_dim1];
f43 = c[i + 3 + (j + 2) * c_dim1];
f34 = c[i + 2 + (j + 3) * c_dim1];
f44 = c[i + 3 + (j + 3) * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + j * b_dim1];
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + j * b_dim1];
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + j * b_dim1];
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + j * b_dim1];
f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 3) * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + 1 + j * c_dim1] = f21;
c[i + (j + 1) * c_dim1] = f12;
c[i + 1 + (j + 1) * c_dim1] = f22;
c[i + (j + 2) * c_dim1] = f13;
c[i + 1 + (j + 2) * c_dim1] = f23;
c[i + (j + 3) * c_dim1] = f14;
c[i + 1 + (j + 3) * c_dim1] = f24;
c[i + 2 + j * c_dim1] = f31;
c[i + 3 + j * c_dim1] = f41;
c[i + 2 + (j + 1) * c_dim1] = f32;
c[i + 3 + (j + 1) * c_dim1] = f42;
c[i + 2 + (j + 2) * c_dim1] = f33;
c[i + 3 + (j + 2) * c_dim1] = f43;
c[i + 2 + (j + 3) * c_dim1] = f34;
c[i + 3 + (j + 3) * c_dim1] = f44;
}
if (uisec < isec)
{
i5 = ii + isec - 1;
for (i = ii + uisec; i <= i5; ++i)
{
f11 = c[i + j * c_dim1];
f12 = c[i + (j + 1) * c_dim1];
f13 = c[i + (j + 2) * c_dim1];
f14 = c[i + (j + 3) * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 1) * b_dim1];
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 2) * b_dim1];
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 3) * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + (j + 1) * c_dim1] = f12;
c[i + (j + 2) * c_dim1] = f13;
c[i + (j + 3) * c_dim1] = f14;
}
}
}
if (ujsec < jsec)
{
i4 = jj + jsec - 1;
for (j = jj + ujsec; j <= i4; ++j)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 4)
{
f11 = c[i + j * c_dim1];
f21 = c[i + 1 + j * c_dim1];
f31 = c[i + 2 + j * c_dim1];
f41 = c[i + 3 + j * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
257] * b[l + j * b_dim1];
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
257] * b[l + j * b_dim1];
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
257] * b[l + j * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + 1 + j * c_dim1] = f21;
c[i + 2 + j * c_dim1] = f31;
c[i + 3 + j * c_dim1] = f41;
}
i5 = ii + isec - 1;
for (i = ii + uisec; i <= i5; ++i)
{
f11 = c[i + j * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
}
c[i + j * c_dim1] = f11;
}
}
}
}
matmul_p = matmul_'rtype_code`_avx512f;
goto tailcall;
}
}
return;
}
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
const 'rtype_name` *restrict abase_x;
const 'rtype_name` *restrict bbase_y;
'rtype_name` *restrict dest_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
#endif /* HAVE_AVX512F */
#ifdef HAVE_AVX2
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
{
bbase_y = &bbase[y*bystride];
dest_y = &dest[y*rystride];
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
matmul_p = matmul_'rtype_code`_avx2;
goto tailcall;
}
}
else
{
const 'rtype_name` *restrict bbase_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
}
}
else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
dest[x*rxstride + y*rystride] = ('rtype_name`)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] +=
abase[x*axstride + n*aystride] *
bbase[n*bxstride + y*bystride];
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
const 'rtype_name` *restrict bbase_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
}
}
else
{
const 'rtype_name` *restrict abase_x;
const 'rtype_name` *restrict bbase_y;
'rtype_name` *restrict dest_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
dest_y = &dest[y*rystride];
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
}
}
}
}'
#endif
#ifdef HAVE_AVX
if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
{
matmul_p = matmul_'rtype_code`_avx;
goto tailcall;
}
#endif /* HAVE_AVX */
}
}
tailcall:
(*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
}
#else /* Just the vanilla function. */
'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'

View File

@ -0,0 +1,537 @@
`void
'matmul_name` ('rtype` * const restrict retarray,
'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const 'rtype_name` * restrict abase;
const 'rtype_name` * restrict bbase;
'rtype_name` * restrict dest;
index_type rxstride, rystride, axstride, aystride, bxstride, bystride;
index_type x, y, n, count, xcount, ycount;
assert (GFC_DESCRIPTOR_RANK (a) == 2
|| GFC_DESCRIPTOR_RANK (b) == 2);
/* C[xcount,ycount] = A[xcount, count] * B[count,ycount]
Either A or B (but not both) can be rank 1:
o One-dimensional argument A is implicitly treated as a row matrix
dimensioned [1,count], so xcount=1.
o One-dimensional argument B is implicitly treated as a column matrix
dimensioned [count, 1], so ycount=1.
*/
if (retarray->base_addr == NULL)
{
if (GFC_DESCRIPTOR_RANK (a) == 1)
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(b,1) - 1, 1);
}
else if (GFC_DESCRIPTOR_RANK (b) == 1)
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
}
else
{
GFC_DIMENSION_SET(retarray->dim[0], 0,
GFC_DESCRIPTOR_EXTENT(a,0) - 1, 1);
GFC_DIMENSION_SET(retarray->dim[1], 0,
GFC_DESCRIPTOR_EXTENT(b,1) - 1,
GFC_DESCRIPTOR_EXTENT(retarray,0));
}
retarray->base_addr
= xmallocarray (size0 ((array_t *) retarray), sizeof ('rtype_name`));
retarray->offset = 0;
}
else if (unlikely (compile_options.bounds_check))
{
index_type ret_extent, arg_extent;
if (GFC_DESCRIPTOR_RANK (a) == 1)
{
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic: is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
else if (GFC_DESCRIPTOR_RANK (b) == 1)
{
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic: is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
else
{
arg_extent = GFC_DESCRIPTOR_EXTENT(a,0);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,0);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic for dimension 1:"
" is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
arg_extent = GFC_DESCRIPTOR_EXTENT(b,1);
ret_extent = GFC_DESCRIPTOR_EXTENT(retarray,1);
if (arg_extent != ret_extent)
runtime_error ("Incorrect extent in return array in"
" MATMUL intrinsic for dimension 2:"
" is %ld, should be %ld",
(long int) ret_extent, (long int) arg_extent);
}
}
'
sinclude(`matmul_asm_'rtype_code`.m4')dnl
`
if (GFC_DESCRIPTOR_RANK (retarray) == 1)
{
/* One-dimensional result may be addressed in the code below
either as a row or a column matrix. We want both cases to
work. */
rxstride = rystride = GFC_DESCRIPTOR_STRIDE(retarray,0);
}
else
{
rxstride = GFC_DESCRIPTOR_STRIDE(retarray,0);
rystride = GFC_DESCRIPTOR_STRIDE(retarray,1);
}
if (GFC_DESCRIPTOR_RANK (a) == 1)
{
/* Treat it as a a row matrix A[1,count]. */
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
aystride = 1;
xcount = 1;
count = GFC_DESCRIPTOR_EXTENT(a,0);
}
else
{
axstride = GFC_DESCRIPTOR_STRIDE(a,0);
aystride = GFC_DESCRIPTOR_STRIDE(a,1);
count = GFC_DESCRIPTOR_EXTENT(a,1);
xcount = GFC_DESCRIPTOR_EXTENT(a,0);
}
if (count != GFC_DESCRIPTOR_EXTENT(b,0))
{
if (count > 0 || GFC_DESCRIPTOR_EXTENT(b,0) > 0)
runtime_error ("dimension of array B incorrect in MATMUL intrinsic");
}
if (GFC_DESCRIPTOR_RANK (b) == 1)
{
/* Treat it as a column matrix B[count,1] */
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
/* bystride should never be used for 1-dimensional b.
in case it is we want it to cause a segfault, rather than
an incorrect result. */
bystride = 0xDEADBEEF;
ycount = 1;
}
else
{
bxstride = GFC_DESCRIPTOR_STRIDE(b,0);
bystride = GFC_DESCRIPTOR_STRIDE(b,1);
ycount = GFC_DESCRIPTOR_EXTENT(b,1);
}
abase = a->base_addr;
bbase = b->base_addr;
dest = retarray->base_addr;
/* Now that everything is set up, we perform the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
#define min(a,b) ((a) <= (b) ? (a) : (b))
#define max(a,b) ((a) >= (b) ? (a) : (b))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const 'rtype_name` one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m,
&n, &k, &one, abase, &lda, bbase, &ldb, &zero, dest,
&ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
/* This block of code implements a tuned matmul, derived from
Superscalar GEMM-based level 3 BLAS, Beta version 0.1
Bo Kagstrom and Per Ling
Department of Computing Science
Umea University
S-901 87 Umea, Sweden
from netlib.org, translated to C, and modified for matmul.m4. */
const 'rtype_name` *a, *b;
'rtype_name` *c;
const index_type m = xcount, n = ycount, k = count;
/* System generated locals */
index_type a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset,
i1, i2, i3, i4, i5, i6;
/* Local variables */
'rtype_name` t1[65536], /* was [256][256] */
f11, f12, f21, f22, f31, f32, f41, f42,
f13, f14, f23, f24, f33, f34, f43, f44;
index_type i, j, l, ii, jj, ll;
index_type isec, jsec, lsec, uisec, ujsec, ulsec;
a = abase;
b = bbase;
c = retarray->base_addr;
/* Parameter adjustments */
c_dim1 = rystride;
c_offset = 1 + c_dim1;
c -= c_offset;
a_dim1 = aystride;
a_offset = 1 + a_dim1;
a -= a_offset;
b_dim1 = bystride;
b_offset = 1 + b_dim1;
b -= b_offset;
/* Early exit if possible */
if (m == 0 || n == 0 || k == 0)
return;
/* Empty c first. */
for (j=1; j<=n; j++)
for (i=1; i<=m; i++)
c[i + j * c_dim1] = ('rtype_name`)0;
/* Start turning the crank. */
i1 = n;
for (jj = 1; jj <= i1; jj += 512)
{
/* Computing MIN */
i2 = 512;
i3 = n - jj + 1;
jsec = min(i2,i3);
ujsec = jsec - jsec % 4;
i2 = k;
for (ll = 1; ll <= i2; ll += 256)
{
/* Computing MIN */
i3 = 256;
i4 = k - ll + 1;
lsec = min(i3,i4);
ulsec = lsec - lsec % 2;
i3 = m;
for (ii = 1; ii <= i3; ii += 256)
{
/* Computing MIN */
i4 = 256;
i5 = m - ii + 1;
isec = min(i4,i5);
uisec = isec - isec % 2;
i4 = ll + ulsec - 1;
for (l = ll; l <= i4; l += 2)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 2)
{
t1[l - ll + 1 + ((i - ii + 1) << 8) - 257] =
a[i + l * a_dim1];
t1[l - ll + 2 + ((i - ii + 1) << 8) - 257] =
a[i + (l + 1) * a_dim1];
t1[l - ll + 1 + ((i - ii + 2) << 8) - 257] =
a[i + 1 + l * a_dim1];
t1[l - ll + 2 + ((i - ii + 2) << 8) - 257] =
a[i + 1 + (l + 1) * a_dim1];
}
if (uisec < isec)
{
t1[l - ll + 1 + (isec << 8) - 257] =
a[ii + isec - 1 + l * a_dim1];
t1[l - ll + 2 + (isec << 8) - 257] =
a[ii + isec - 1 + (l + 1) * a_dim1];
}
}
if (ulsec < lsec)
{
i4 = ii + isec - 1;
for (i = ii; i<= i4; ++i)
{
t1[lsec + ((i - ii + 1) << 8) - 257] =
a[i + (ll + lsec - 1) * a_dim1];
}
}
uisec = isec - isec % 4;
i4 = jj + ujsec - 1;
for (j = jj; j <= i4; j += 4)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 4)
{
f11 = c[i + j * c_dim1];
f21 = c[i + 1 + j * c_dim1];
f12 = c[i + (j + 1) * c_dim1];
f22 = c[i + 1 + (j + 1) * c_dim1];
f13 = c[i + (j + 2) * c_dim1];
f23 = c[i + 1 + (j + 2) * c_dim1];
f14 = c[i + (j + 3) * c_dim1];
f24 = c[i + 1 + (j + 3) * c_dim1];
f31 = c[i + 2 + j * c_dim1];
f41 = c[i + 3 + j * c_dim1];
f32 = c[i + 2 + (j + 1) * c_dim1];
f42 = c[i + 3 + (j + 1) * c_dim1];
f33 = c[i + 2 + (j + 2) * c_dim1];
f43 = c[i + 3 + (j + 2) * c_dim1];
f34 = c[i + 2 + (j + 3) * c_dim1];
f44 = c[i + 3 + (j + 3) * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + j * b_dim1];
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + j * b_dim1];
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f22 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f23 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f24 += t1[l - ll + 1 + ((i - ii + 2) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + j * b_dim1];
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + j * b_dim1];
f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f42 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 1) * b_dim1];
f33 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f43 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 2) * b_dim1];
f34 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257]
* b[l + (j + 3) * b_dim1];
f44 += t1[l - ll + 1 + ((i - ii + 4) << 8) - 257]
* b[l + (j + 3) * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + 1 + j * c_dim1] = f21;
c[i + (j + 1) * c_dim1] = f12;
c[i + 1 + (j + 1) * c_dim1] = f22;
c[i + (j + 2) * c_dim1] = f13;
c[i + 1 + (j + 2) * c_dim1] = f23;
c[i + (j + 3) * c_dim1] = f14;
c[i + 1 + (j + 3) * c_dim1] = f24;
c[i + 2 + j * c_dim1] = f31;
c[i + 3 + j * c_dim1] = f41;
c[i + 2 + (j + 1) * c_dim1] = f32;
c[i + 3 + (j + 1) * c_dim1] = f42;
c[i + 2 + (j + 2) * c_dim1] = f33;
c[i + 3 + (j + 2) * c_dim1] = f43;
c[i + 2 + (j + 3) * c_dim1] = f34;
c[i + 3 + (j + 3) * c_dim1] = f44;
}
if (uisec < isec)
{
i5 = ii + isec - 1;
for (i = ii + uisec; i <= i5; ++i)
{
f11 = c[i + j * c_dim1];
f12 = c[i + (j + 1) * c_dim1];
f13 = c[i + (j + 2) * c_dim1];
f14 = c[i + (j + 3) * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
f12 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 1) * b_dim1];
f13 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 2) * b_dim1];
f14 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + (j + 3) * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + (j + 1) * c_dim1] = f12;
c[i + (j + 2) * c_dim1] = f13;
c[i + (j + 3) * c_dim1] = f14;
}
}
}
if (ujsec < jsec)
{
i4 = jj + jsec - 1;
for (j = jj + ujsec; j <= i4; ++j)
{
i5 = ii + uisec - 1;
for (i = ii; i <= i5; i += 4)
{
f11 = c[i + j * c_dim1];
f21 = c[i + 1 + j * c_dim1];
f31 = c[i + 2 + j * c_dim1];
f41 = c[i + 3 + j * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
f21 += t1[l - ll + 1 + ((i - ii + 2) << 8) -
257] * b[l + j * b_dim1];
f31 += t1[l - ll + 1 + ((i - ii + 3) << 8) -
257] * b[l + j * b_dim1];
f41 += t1[l - ll + 1 + ((i - ii + 4) << 8) -
257] * b[l + j * b_dim1];
}
c[i + j * c_dim1] = f11;
c[i + 1 + j * c_dim1] = f21;
c[i + 2 + j * c_dim1] = f31;
c[i + 3 + j * c_dim1] = f41;
}
i5 = ii + isec - 1;
for (i = ii + uisec; i <= i5; ++i)
{
f11 = c[i + j * c_dim1];
i6 = ll + lsec - 1;
for (l = ll; l <= i6; ++l)
{
f11 += t1[l - ll + 1 + ((i - ii + 1) << 8) -
257] * b[l + j * b_dim1];
}
c[i + j * c_dim1] = f11;
}
}
}
}
}
}
return;
}
else if (rxstride == 1 && aystride == 1 && bxstride == 1)
{
if (GFC_DESCRIPTOR_RANK (a) != 1)
{
const 'rtype_name` *restrict abase_x;
const 'rtype_name` *restrict bbase_y;
'rtype_name` *restrict dest_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
dest_y = &dest[y*rystride];
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase_x[n] * bbase_y[n];
dest_y[x] = s;
}
}
}
else
{
const 'rtype_name` *restrict bbase_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n];
dest[y*rystride] = s;
}
}
}
else if (axstride < aystride)
{
for (y = 0; y < ycount; y++)
for (x = 0; x < xcount; x++)
dest[x*rxstride + y*rystride] = ('rtype_name`)0;
for (y = 0; y < ycount; y++)
for (n = 0; n < count; n++)
for (x = 0; x < xcount; x++)
/* dest[x,y] += a[x,n] * b[n,y] */
dest[x*rxstride + y*rystride] +=
abase[x*axstride + n*aystride] *
bbase[n*bxstride + y*bystride];
}
else if (GFC_DESCRIPTOR_RANK (a) == 1)
{
const 'rtype_name` *restrict bbase_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase[n*axstride] * bbase_y[n*bxstride];
dest[y*rxstride] = s;
}
}
else
{
const 'rtype_name` *restrict abase_x;
const 'rtype_name` *restrict bbase_y;
'rtype_name` *restrict dest_y;
'rtype_name` s;
for (y = 0; y < ycount; y++)
{
bbase_y = &bbase[y*bystride];
dest_y = &dest[y*rystride];
for (x = 0; x < xcount; x++)
{
abase_x = &abase[x*axstride];
s = ('rtype_name`) 0;
for (n = 0; n < count; n++)
s += abase_x[n*aystride] * bbase_y[n*bxstride];
dest_y[x*rxstride] = s;
}
}
}
}
#undef POW3
#undef min
#undef max
'