re PR fortran/26025 (Optionally use BLAS for matmul)

PR fortran/26025

	* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
	* options.c (gfc_init_options): Initialize new flags.
	(gfc_handle_option): Handle new flags.
	* gfortran.h (gfc_option): Add flag_external_blas and
	blas_matmul_limit flags.
	* trans-expr.c (gfc_conv_function_call): Use new argument
	append_args, appending it at the end of the argument list
	built for a function call.
	* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
	append_args argument to gfc_trans_call.
	* trans.h (gfc_conv_function_call): Update prototype.
	* trans-decl.c (gfc_build_intrinsic_function_decls): Add
	prototypes for BLAS ?gemm routines.
	* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
	extra arguments given to the library matmul function, and give
	them to gfc_conv_function_call.
	* invoke.texi: Add documentation for -fexternal-blas and
	-fblas-matmul-limit.

	* m4/matmul.m4: Add possible call to gemm routine.
	* generated/matmul_r8.c: Regenerate.
	* generated/matmul_r16.c: Regenerate.
	* generated/matmul_c8.c: Regenerate.
	* generated/matmul_i8.c: Regenerate.
	* generated/matmul_c16.c: Regenerate.
	* generated/matmul_r10.c: Regenerate.
	* generated/matmul_r4.c: Regenerate.
	* generated/matmul_c10.c: Regenerate.
	* generated/matmul_c4.c: Regenerate.
	* generated/matmul_i4.c: Regenerate.
	* generated/matmul_i16.c: Regenerate.

From-SVN: r117948
This commit is contained in:
Francois-Xavier Coudert 2006-10-22 09:41:48 +02:00 committed by François-Xavier Coudert
parent d4f6a8099d
commit 5a0aad3165
23 changed files with 726 additions and 43 deletions

View File

@ -1,3 +1,25 @@
2006-10-22 Francois-Xavier Coudert <coudert@clipper.ens.fr>
PR fortran/26025
* lang.opt: Add -fexternal-blas and -fblas-matmul-limit options.
* options.c (gfc_init_options): Initialize new flags.
(gfc_handle_option): Handle new flags.
* gfortran.h (gfc_option): Add flag_external_blas and
blas_matmul_limit flags.
* trans-expr.c (gfc_conv_function_call): Use new argument
append_args, appending it at the end of the argument list
built for a function call.
* trans-stmt.c (gfc_trans_call): Use NULL_TREE for the new
append_args argument to gfc_trans_call.
* trans.h (gfc_conv_function_call): Update prototype.
* trans-decl.c (gfc_build_intrinsic_function_decls): Add
prototypes for BLAS ?gemm routines.
* trans-intrinsic.c (gfc_conv_intrinsic_funcall): Generate the
extra arguments given to the library matmul function, and give
them to gfc_conv_function_call.
* invoke.texi: Add documentation for -fexternal-blas and
-fblas-matmul-limit.
2006-10-21 Kaveh R. Ghazi <ghazi@caip.rutgers.edu>
* Make-lang.in (F95_LIBS): Delete.

View File

@ -1652,6 +1652,8 @@ typedef struct
int flag_f2c;
int flag_automatic;
int flag_backslash;
int flag_external_blas;
int blas_matmul_limit;
int flag_cray_pointer;
int flag_d_lines;
int flag_openmp;

View File

@ -152,7 +152,8 @@ by type. Explanations are in the following sections.
@gccoptlist{
-fno-automatic -ff2c -fno-underscoring -fsecond-underscore @gol
-fbounds-check -fmax-stack-var-size=@var{n} @gol
-fpack-derived -frepack-arrays -fshort-enums}
-fpack-derived -frepack-arrays -fshort-enums -fexternal-blas
-fblas-matmul-limit=@var{n}}
@end table
@menu
@ -859,6 +860,27 @@ This option is provided for interoperability with C code that was
compiled with the @command{-fshort-enums} option. It will make
GNU Fortran choose the smallest @code{INTEGER} kind a given
enumerator set will fit in, and give all its enumerators this kind.
@cindex -fexternal-blas
@item -fexternal-blas
This option will make gfortran generate calls to BLAS functions for some
matrix operations like @code{MATMUL}, instead of using our own
algorithms, if the size of the matrices involved is larger than a given
limit (see @command{-fblas-matmul-limit}). This may be profitable if an
optimized vendor BLAS library is available. The BLAS library will have
to be specified at link time.
@cindex -fblas-matmul-limit
@item -fblas-matmul-limit=@var{n}
Only significant when @command{-fexternal-blas} is in effect.
Matrix multiplication of matrices with size larger than (or equal to) @var{n}
will be performed by calls to BLAS functions, while others will be
handled by @command{gfortran} internal algorithms. If the matrices
involved are not square, the size comparison is performed using the
geometric mean of the dimensions of the argument and result matrices.
The default value for @var{n} is 30.
@end table
@xref{Code Gen Options,,Options for Code Generation Conventions,

View File

@ -85,6 +85,14 @@ fbackslash
Fortran
Specify that backslash in string introduces an escape character
fexternal-blas
Fortran
Specify that an external BLAS library should be used for matmul calls on large-size arrays
fblas-matmul-limit=
Fortran RejectNegative Joined UInteger
-fblas-matmul-limit=<n> Size of the smallest matrix for which matmul will use BLAS
fdefault-double-8
Fortran
Set the default double precision kind to an 8 byte wide type

View File

@ -80,6 +80,8 @@ gfc_init_options (unsigned int argc ATTRIBUTE_UNUSED,
gfc_option.flag_preprocessed = 0;
gfc_option.flag_automatic = 1;
gfc_option.flag_backslash = 1;
gfc_option.flag_external_blas = 0;
gfc_option.blas_matmul_limit = 30;
gfc_option.flag_cray_pointer = 0;
gfc_option.flag_d_lines = -1;
gfc_option.flag_openmp = 0;
@ -450,6 +452,14 @@ gfc_handle_option (size_t scode, const char *arg, int value)
gfc_option.flag_dollar_ok = value;
break;
case OPT_fexternal_blas:
gfc_option.flag_external_blas = value;
break;
case OPT_fblas_matmul_limit_:
gfc_option.blas_matmul_limit = value;
break;
case OPT_fd_lines_as_code:
gfc_option.flag_d_lines = 1;
break;

View File

@ -143,6 +143,12 @@ tree gfor_fndecl_iargc;
tree gfor_fndecl_si_kind;
tree gfor_fndecl_sr_kind;
/* BLAS gemm functions. */
tree gfor_fndecl_sgemm;
tree gfor_fndecl_dgemm;
tree gfor_fndecl_cgemm;
tree gfor_fndecl_zgemm;
static void
gfc_add_decl_to_parent_function (tree decl)
@ -2186,6 +2192,49 @@ gfc_build_intrinsic_function_decls (void)
gfc_int4_type_node, 1,
gfc_real16_type_node);
/* BLAS functions. */
{
tree pint = build_pointer_type (gfc_c_int_type_node);
tree ps = build_pointer_type (gfc_get_real_type (gfc_default_real_kind));
tree pd = build_pointer_type (gfc_get_real_type (gfc_default_double_kind));
tree pc = build_pointer_type (gfc_get_complex_type (gfc_default_real_kind));
tree pz = build_pointer_type
(gfc_get_complex_type (gfc_default_double_kind));
gfor_fndecl_sgemm = gfc_build_library_function_decl
(get_identifier
(gfc_option.flag_underscoring ? "sgemm_"
: "sgemm"),
void_type_node, 15, pchar_type_node,
pchar_type_node, pint, pint, pint, ps, ps, pint,
ps, pint, ps, ps, pint, gfc_c_int_type_node,
gfc_c_int_type_node);
gfor_fndecl_dgemm = gfc_build_library_function_decl
(get_identifier
(gfc_option.flag_underscoring ? "dgemm_"
: "dgemm"),
void_type_node, 15, pchar_type_node,
pchar_type_node, pint, pint, pint, pd, pd, pint,
pd, pint, pd, pd, pint, gfc_c_int_type_node,
gfc_c_int_type_node);
gfor_fndecl_cgemm = gfc_build_library_function_decl
(get_identifier
(gfc_option.flag_underscoring ? "cgemm_"
: "cgemm"),
void_type_node, 15, pchar_type_node,
pchar_type_node, pint, pint, pint, pc, pc, pint,
pc, pint, pc, pc, pint, gfc_c_int_type_node,
gfc_c_int_type_node);
gfor_fndecl_zgemm = gfc_build_library_function_decl
(get_identifier
(gfc_option.flag_underscoring ? "zgemm_"
: "zgemm"),
void_type_node, 15, pchar_type_node,
pchar_type_node, pint, pint, pint, pz, pz, pint,
pz, pint, pz, pz, pint, gfc_c_int_type_node,
gfc_c_int_type_node);
}
/* Other functions. */
gfor_fndecl_size0 =
gfc_build_library_function_decl (get_identifier (PREFIX("size0")),

View File

@ -1853,7 +1853,7 @@ is_aliased_array (gfc_expr * e)
int
gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
gfc_actual_arglist * arg)
gfc_actual_arglist * arg, tree append_args)
{
gfc_interface_mapping mapping;
tree arglist;
@ -2226,6 +2226,11 @@ gfc_conv_function_call (gfc_se * se, gfc_symbol * sym,
/* Add the hidden string length parameters to the arguments. */
arglist = chainon (arglist, stringargs);
/* We may want to append extra arguments here. This is used e.g. for
calls to libgfortran_matmul_??, which need extra information. */
if (append_args != NULL_TREE)
arglist = chainon (arglist, append_args);
/* Generate the actual call. */
gfc_conv_function_val (se, sym);
/* If there are alternate return labels, function type should be
@ -2545,7 +2550,7 @@ gfc_conv_function_expr (gfc_se * se, gfc_expr * expr)
sym = expr->value.function.esym;
if (!sym)
sym = expr->symtree->n.sym;
gfc_conv_function_call (se, sym, expr->value.function.actual);
gfc_conv_function_call (se, sym, expr->value.function.actual, NULL_TREE);
}

View File

@ -1378,6 +1378,7 @@ static void
gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
{
gfc_symbol *sym;
tree append_args;
gcc_assert (!se->ss || se->ss->expr == expr);
@ -1387,7 +1388,54 @@ gfc_conv_intrinsic_funcall (gfc_se * se, gfc_expr * expr)
gcc_assert (expr->rank == 0);
sym = gfc_get_symbol_for_expr (expr);
gfc_conv_function_call (se, sym, expr->value.function.actual);
/* Calls to libgfortran_matmul need to be appended special arguments,
to be able to call the BLAS ?gemm functions if required and possible. */
append_args = NULL_TREE;
if (expr->value.function.isym->generic_id == GFC_ISYM_MATMUL
&& sym->ts.type != BT_LOGICAL)
{
tree cint = gfc_get_int_type (gfc_c_int_kind);
if (gfc_option.flag_external_blas
&& (sym->ts.type == BT_REAL || sym->ts.type == BT_COMPLEX)
&& (sym->ts.kind == gfc_default_real_kind
|| sym->ts.kind == gfc_default_double_kind))
{
tree gemm_fndecl;
if (sym->ts.type == BT_REAL)
{
if (sym->ts.kind == gfc_default_real_kind)
gemm_fndecl = gfor_fndecl_sgemm;
else
gemm_fndecl = gfor_fndecl_dgemm;
}
else
{
if (sym->ts.kind == gfc_default_real_kind)
gemm_fndecl = gfor_fndecl_cgemm;
else
gemm_fndecl = gfor_fndecl_zgemm;
}
append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 1));
append_args = gfc_chainon_list
(append_args, build_int_cst
(cint, gfc_option.blas_matmul_limit));
append_args = gfc_chainon_list (append_args,
gfc_build_addr_expr (NULL_TREE,
gemm_fndecl));
}
else
{
append_args = gfc_chainon_list (NULL_TREE, build_int_cst (cint, 0));
append_args = gfc_chainon_list (append_args, build_int_cst (cint, 0));
append_args = gfc_chainon_list (append_args, null_pointer_node);
}
}
gfc_conv_function_call (se, sym, expr->value.function.actual, append_args);
gfc_free (sym);
}

View File

@ -334,7 +334,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
/* Translate the call. */
has_alternate_specifier
= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual);
= gfc_conv_function_call (&se, code->resolved_sym, code->ext.actual,
NULL_TREE);
/* A subroutine without side-effect, by definition, does nothing! */
TREE_SIDE_EFFECTS (se.expr) = 1;
@ -399,7 +400,8 @@ gfc_trans_call (gfc_code * code, bool dependency_check)
gfc_init_block (&block);
/* Add the subroutine call to the block. */
gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual);
gfc_conv_function_call (&loopse, code->resolved_sym, code->ext.actual,
NULL_TREE);
gfc_add_expr_to_block (&loopse.pre, loopse.expr);
gfc_add_block_to_block (&block, &loopse.pre);

View File

@ -303,7 +303,8 @@ void gfc_conv_intrinsic_function (gfc_se *, gfc_expr *);
int gfc_is_intrinsic_libcall (gfc_expr *);
/* Also used to CALL subroutines. */
int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *);
int gfc_conv_function_call (gfc_se *, gfc_symbol *, gfc_actual_arglist *,
tree);
/* gfc_trans_* shouldn't call push/poplevel, use gfc_push/pop_scope */
/* Generate code for a scalar assignment. */
@ -507,6 +508,12 @@ extern GTY(()) tree gfor_fndecl_math_exponent8;
extern GTY(()) tree gfor_fndecl_math_exponent10;
extern GTY(()) tree gfor_fndecl_math_exponent16;
/* BLAS functions. */
extern GTY(()) tree gfor_fndecl_sgemm;
extern GTY(()) tree gfor_fndecl_dgemm;
extern GTY(()) tree gfor_fndecl_cgemm;
extern GTY(()) tree gfor_fndecl_zgemm;
/* String functions. */
extern GTY(()) tree gfor_fndecl_compare_string;
extern GTY(()) tree gfor_fndecl_concat_string;

View File

@ -1,3 +1,19 @@
2006-10-22 Francois-Xavier Coudert <coudert@clipper.ens.fr>
PR fortran/26025
* m4/matmul.m4: Add possible call to gemm routine.
* generated/matmul_r8.c: Regenerate.
* generated/matmul_r16.c: Regenerate.
* generated/matmul_c8.c: Regenerate.
* generated/matmul_i8.c: Regenerate.
* generated/matmul_c16.c: Regenerate.
* generated/matmul_r10.c: Regenerate.
* generated/matmul_r4.c: Regenerate.
* generated/matmul_c10.c: Regenerate.
* generated/matmul_c4.c: Regenerate.
* generated/matmul_i4.c: Regenerate.
* generated/matmul_i16.c: Regenerate.
2006-10-21 Steven G. Kargl <kargl@gcc.gnu.org>
* runtime/error.c: Add errno.h

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_COMPLEX_10)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_COMPLEX_10 *, const GFC_COMPLEX_10 *,
const int *, const GFC_COMPLEX_10 *, const int *,
const GFC_COMPLEX_10 *, GFC_COMPLEX_10 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b);
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c10);
void
matmul_c10 (gfc_array_c10 * const restrict retarray,
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b)
gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_COMPLEX_10 * restrict abase;
const GFC_COMPLEX_10 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c10 (gfc_array_c10 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_COMPLEX_10 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_COMPLEX_10 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_COMPLEX_16)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_COMPLEX_16 *, const GFC_COMPLEX_16 *,
const int *, const GFC_COMPLEX_16 *, const int *,
const GFC_COMPLEX_16 *, GFC_COMPLEX_16 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b);
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c16);
void
matmul_c16 (gfc_array_c16 * const restrict retarray,
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b)
gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_COMPLEX_16 * restrict abase;
const GFC_COMPLEX_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c16 (gfc_array_c16 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_COMPLEX_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_COMPLEX_16 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_COMPLEX_4)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_COMPLEX_4 *, const GFC_COMPLEX_4 *,
const int *, const GFC_COMPLEX_4 *, const int *,
const GFC_COMPLEX_4 *, GFC_COMPLEX_4 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b);
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c4);
void
matmul_c4 (gfc_array_c4 * const restrict retarray,
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b)
gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_COMPLEX_4 * restrict abase;
const GFC_COMPLEX_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_COMPLEX_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_COMPLEX_4 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_COMPLEX_8)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_COMPLEX_8 *, const GFC_COMPLEX_8 *,
const int *, const GFC_COMPLEX_8 *, const int *,
const GFC_COMPLEX_8 *, GFC_COMPLEX_8 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b);
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_c8);
void
matmul_c8 (gfc_array_c8 * const restrict retarray,
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b)
gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_COMPLEX_8 * restrict abase;
const GFC_COMPLEX_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_COMPLEX_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_COMPLEX_8 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_INTEGER_16)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_INTEGER_16 *, const GFC_INTEGER_16 *,
const int *, const GFC_INTEGER_16 *, const int *,
const GFC_INTEGER_16 *, GFC_INTEGER_16 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b);
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i16);
void
matmul_i16 (gfc_array_i16 * const restrict retarray,
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b)
gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_INTEGER_16 * restrict abase;
const GFC_INTEGER_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i16 (gfc_array_i16 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_INTEGER_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_INTEGER_16 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_INTEGER_4)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_INTEGER_4 *, const GFC_INTEGER_4 *,
const int *, const GFC_INTEGER_4 *, const int *,
const GFC_INTEGER_4 *, GFC_INTEGER_4 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b);
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i4);
void
matmul_i4 (gfc_array_i4 * const restrict retarray,
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b)
gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_INTEGER_4 * restrict abase;
const GFC_INTEGER_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_INTEGER_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_INTEGER_4 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_INTEGER_8)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_INTEGER_8 *, const GFC_INTEGER_8 *,
const int *, const GFC_INTEGER_8 *, const int *,
const GFC_INTEGER_8 *, GFC_INTEGER_8 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b);
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_i8);
void
matmul_i8 (gfc_array_i8 * const restrict retarray,
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b)
gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_INTEGER_8 * restrict abase;
const GFC_INTEGER_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_INTEGER_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_INTEGER_8 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_REAL_10)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_REAL_10 *, const GFC_REAL_10 *,
const int *, const GFC_REAL_10 *, const int *,
const GFC_REAL_10 *, GFC_REAL_10 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b);
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r10);
void
matmul_r10 (gfc_array_r10 * const restrict retarray,
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b)
gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_REAL_10 * restrict abase;
const GFC_REAL_10 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r10 (gfc_array_r10 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_REAL_10 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_REAL_10 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_REAL_16)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_REAL_16 *, const GFC_REAL_16 *,
const int *, const GFC_REAL_16 *, const int *,
const GFC_REAL_16 *, GFC_REAL_16 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b);
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r16);
void
matmul_r16 (gfc_array_r16 * const restrict retarray,
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b)
gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_REAL_16 * restrict abase;
const GFC_REAL_16 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r16 (gfc_array_r16 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_REAL_16 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_REAL_16 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_REAL_4)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_REAL_4 *, const GFC_REAL_4 *,
const int *, const GFC_REAL_4 *, const int *,
const GFC_REAL_4 *, GFC_REAL_4 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b);
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r4);
void
matmul_r4 (gfc_array_r4 * const restrict retarray,
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b)
gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_REAL_4 * restrict abase;
const GFC_REAL_4 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_REAL_4 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_REAL_4 * restrict bbase_y;

View File

@ -36,6 +36,16 @@ Boston, MA 02110-1301, USA. */
#if defined (HAVE_GFC_REAL_8)
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const GFC_REAL_8 *, const GFC_REAL_8 *,
const int *, const GFC_REAL_8 *, const int *,
const GFC_REAL_8 *, GFC_REAL_8 *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -56,18 +66,24 @@ Boston, MA 02110-1301, USA. */
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b);
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_r8);
void
matmul_r8 (gfc_array_r8 * const restrict retarray,
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b)
gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const GFC_REAL_8 * restrict abase;
const GFC_REAL_8 * restrict bbase;
@ -177,6 +193,31 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const GFC_REAL_8 one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const GFC_REAL_8 * restrict bbase_y;

View File

@ -37,6 +37,16 @@ include(iparm.m4)dnl
`#if defined (HAVE_'rtype_name`)'
/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
passed to us by the front-end, in which case we'll call it for large
matrices. */
typedef void (*blas_call)(const char *, const char *, const int *, const int *,
const int *, const rtype_name *, const rtype_name *,
const int *, const rtype_name *, const int *,
const rtype_name *, rtype_name *, const int *,
int, int);
/* The order of loops is different in the case of plain matrix
multiplication C=MATMUL(A,B), and in the frequent special case where
the argument A is the temporary result of a TRANSPOSE intrinsic:
@ -57,18 +67,24 @@ include(iparm.m4)dnl
DO I=1,M
S = 0
DO K=1,COUNT
S = S+A(I,K)+B(K,J)
S = S+A(I,K)*B(K,J)
C(I,J) = S
ENDIF
*/
/* If try_blas is set to a nonzero value, then the matmul function will
see if there is a way to perform the matrix multiplication by a call
to the BLAS gemm function. */
extern void matmul_`'rtype_code (rtype * const restrict retarray,
rtype * const restrict a, rtype * const restrict b);
rtype * const restrict a, rtype * const restrict b, int try_blas,
int blas_limit, blas_call gemm);
export_proto(matmul_`'rtype_code);
void
matmul_`'rtype_code (rtype * const restrict retarray,
rtype * const restrict a, rtype * const restrict b)
rtype * const restrict a, rtype * const restrict b, int try_blas,
int blas_limit, blas_call gemm)
{
const rtype_name * restrict abase;
const rtype_name * restrict bbase;
@ -179,6 +195,31 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
bbase = b->data;
dest = retarray->data;
/* Now that everything is set up, we're performing the multiplication
itself. */
#define POW3(x) (((float) (x)) * ((float) (x)) * ((float) (x)))
if (try_blas && rxstride == 1 && (axstride == 1 || aystride == 1)
&& (bxstride == 1 || bystride == 1)
&& (((float) xcount) * ((float) ycount) * ((float) count)
> POW3(blas_limit)))
{
const int m = xcount, n = ycount, k = count, ldc = rystride;
const rtype_name one = 1, zero = 0;
const int lda = (axstride == 1) ? aystride : axstride,
ldb = (bxstride == 1) ? bystride : bxstride;
if (lda > 0 && ldb > 0 && ldc > 0 && m > 1 && n > 1 && k > 1)
{
assert (gemm != NULL);
gemm (axstride == 1 ? "N" : "T", bxstride == 1 ? "N" : "T", &m, &n, &k,
&one, abase, &lda, bbase, &ldb, &zero, dest, &ldc, 1, 1);
return;
}
}
if (rxstride == 1 && axstride == 1 && bxstride == 1)
{
const rtype_name * restrict bbase_y;