Make-lang.in (fortran/trans-resolve.o): Depend on fortran/dependency.h.

gcc/fortran/
	* Make-lang.in (fortran/trans-resolve.o): Depend on
	fortran/dependency.h.
	* gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
	* dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
	(gfc_check_fncall_dependency): Change prototype.
	* dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
	(gfc_check_argument_var_dependency): New function, split from
	gfc_check_fncall_dependency.
	(gfc_check_argument_dependency): New function.
	(gfc_check_fncall_dependency): Replace the expression parameter with
	separate symbol and argument list parameters.  Generalize the function
	to handle dependencies for any type of expression, not just variables.
	Accept a further argument giving the intent of the expression being
	tested.  Ignore intent(in) arguments if that expression is also
	intent(in).
	* resolve.c: Include dependency.h.
	(find_noncopying_intrinsics): New function.
	(resolve_function, resolve_call): Call it on success.
	* trans-array.h (gfc_conv_array_transpose): Declare.
	(gfc_check_fncall_dependency): Remove prototype.
	* trans-array.c (gfc_conv_array_transpose): New function.
	* trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
	libcall handling if the expression is to be evaluated inline.
	Add a case for handling inline transpose()s.
	* trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
	interface provided by gfc_check_fncall_dependency.

libgfortran/
	* m4/matmul.m4: Use a different order in the special case of a
	transposed first argument.
	* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
	generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
	generated/matmul_i16.c, generated/matmul_r4.c, generated/matmul_r8.c,
	generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.

Co-Authored-By: Victor Leikehman <LEI@il.ibm.com>

From-SVN: r108459
Author: Richard Sandiford <richard@codesourcery.com>
Date:   2005-12-13 05:23:12 +0000
Commit: 1524f80b1c, parent: 264c41eda5
23 changed files with 1051 additions and 146 deletions

gcc/fortran/ChangeLog
@@ -1,3 +1,32 @@
+2005-12-13  Richard Sandiford  <richard@codesourcery.com>
+
+	* Make-lang.in (fortran/trans-resolve.o): Depend on
+	fortran/dependency.h.
+	* gfortran.h (gfc_expr): Add an "inline_noncopying_intrinsic" flag.
+	* dependency.h (gfc_get_noncopying_intrinsic_argument): Declare.
+	(gfc_check_fncall_dependency): Change prototype.
+	* dependency.c (gfc_get_noncopying_intrinsic_argument): New function.
+	(gfc_check_argument_var_dependency): New function, split from
+	gfc_check_fncall_dependency.
+	(gfc_check_argument_dependency): New function.
+	(gfc_check_fncall_dependency): Replace the expression parameter with
+	separate symbol and argument list parameters.  Generalize the function
+	to handle dependencies for any type of expression, not just variables.
+	Accept a further argument giving the intent of the expression being
+	tested.  Ignore intent(in) arguments if that expression is also
+	intent(in).
+	* resolve.c: Include dependency.h.
+	(find_noncopying_intrinsics): New function.
+	(resolve_function, resolve_call): Call it on success.
+	* trans-array.h (gfc_conv_array_transpose): Declare.
+	(gfc_check_fncall_dependency): Remove prototype.
+	* trans-array.c (gfc_conv_array_transpose): New function.
+	* trans-intrinsic.c (gfc_conv_intrinsic_function): Don't use the
+	libcall handling if the expression is to be evaluated inline.
+	Add a case for handling inline transpose()s.
+	* trans-expr.c (gfc_trans_arrayfunc_assign): Adjust for the new
+	interface provided by gfc_check_fncall_dependency.
+
 2005-12-12  Steven G. Kargl  <kargls@comcast.net>
 
 	PR fortran/25078

gcc/fortran/Make-lang.in
@@ -286,4 +286,5 @@ fortran/trans-intrinsic.o: $(GFORTRAN_TRANS_DEPS) fortran/mathbuiltins.def \
 	gt-fortran-trans-intrinsic.h
 fortran/dependency.o: $(GFORTRAN_TRANS_DEPS) fortran/dependency.h
 fortran/trans-common.o: $(GFORTRAN_TRANS_DEPS)
+fortran/resolve.o: fortran/dependency.h

gcc/fortran/dependency.c
@@ -175,6 +175,32 @@ gfc_is_same_range (gfc_array_ref * ar1, gfc_array_ref * ar2, int n, int def)
 }
 
 
+/* Some array-returning intrinsics can be implemented by reusing the
+   data from one of the array arguments.  For example, TRANSPOSE does
+   not necessarily need to allocate new data: it can be implemented
+   by copying the original array's descriptor and simply swapping the
+   two dimension specifications.
+
+   If EXPR is a call to such an intrinsic, return the argument
+   whose data can be reused, otherwise return NULL.  */
+
+gfc_expr *
+gfc_get_noncopying_intrinsic_argument (gfc_expr * expr)
+{
+  if (expr->expr_type != EXPR_FUNCTION || !expr->value.function.isym)
+    return NULL;
+
+  switch (expr->value.function.isym->generic_id)
+    {
+    case GFC_ISYM_TRANSPOSE:
+      return expr->value.function.actual->expr;
+
+    default:
+      return NULL;
+    }
+}
+
+
 /* Return true if the result of reference REF can only be constructed
    using a temporary array.  */
@@ -214,23 +240,82 @@ gfc_ref_needs_temporary_p (gfc_ref *ref)
 }
 
 
-/* Dependency checking for direct function return by reference.
-   Returns true if the arguments of the function depend on the
-   destination.  This is considerably less conservative than other
-   dependencies because many function arguments will already be
-   copied into a temporary.  */
+/* Return true if array variable VAR could be passed to the same function
+   as argument EXPR without interfering with EXPR.  INTENT is the intent
+   of VAR.
+
+   This is considerably less conservative than other dependencies
+   because many function arguments will already be copied into a
+   temporary.  */
+
+static int
+gfc_check_argument_var_dependency (gfc_expr * var, sym_intent intent,
+                                   gfc_expr * expr)
+{
+  gcc_assert (var->expr_type == EXPR_VARIABLE);
+  gcc_assert (var->rank > 0);
+
+  switch (expr->expr_type)
+    {
+    case EXPR_VARIABLE:
+      return (gfc_ref_needs_temporary_p (expr->ref)
+              || gfc_check_dependency (var, expr, NULL, 0));
+
+    case EXPR_ARRAY:
+      return gfc_check_dependency (var, expr, NULL, 0);
+
+    case EXPR_FUNCTION:
+      if (intent != INTENT_IN && expr->inline_noncopying_intrinsic)
+        {
+          expr = gfc_get_noncopying_intrinsic_argument (expr);
+          return gfc_check_argument_var_dependency (var, intent, expr);
+        }
+      return 0;
+
+    default:
+      return 0;
+    }
+}
+
+
+/* Like gfc_check_argument_var_dependency, but extended to any
+   array expression OTHER, not just variables.  */
+
+static int
+gfc_check_argument_dependency (gfc_expr * other, sym_intent intent,
+                               gfc_expr * expr)
+{
+  switch (other->expr_type)
+    {
+    case EXPR_VARIABLE:
+      return gfc_check_argument_var_dependency (other, intent, expr);
+
+    case EXPR_FUNCTION:
+      if (other->inline_noncopying_intrinsic)
+        {
+          other = gfc_get_noncopying_intrinsic_argument (other);
+          return gfc_check_argument_dependency (other, INTENT_IN, expr);
+        }
+      return 0;
+
+    default:
+      return 0;
+    }
+}
+
+
+/* Like gfc_check_argument_dependency, but check all the arguments in ACTUAL.
+   FNSYM is the function being called, or NULL if not known.  */
 
 int
-gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
+gfc_check_fncall_dependency (gfc_expr * other, sym_intent intent,
+                             gfc_symbol * fnsym, gfc_actual_arglist * actual)
 {
-  gfc_actual_arglist *actual;
+  gfc_formal_arglist *formal;
   gfc_expr *expr;
 
-  gcc_assert (dest->expr_type == EXPR_VARIABLE
-              && fncall->expr_type == EXPR_FUNCTION);
-  gcc_assert (fncall->rank > 0);
-
-  for (actual = fncall->value.function.actual; actual; actual = actual->next)
+  formal = fnsym ? fnsym->formal : NULL;
+  for (; actual; actual = actual->next, formal = formal ? formal->next : NULL)
     {
       expr = actual->expr;
@@ -238,23 +323,14 @@ gfc_check_fncall_dependency (gfc_expr * dest, gfc_expr * fncall)
       if (!expr)
         continue;
 
-      /* Non-variable expressions will be allocated temporaries anyway.  */
-      switch (expr->expr_type)
-        {
-        case EXPR_VARIABLE:
-          if (!gfc_ref_needs_temporary_p (expr->ref)
-              && gfc_check_dependency (dest, expr, NULL, 0))
-            return 1;
-          break;
-
-        case EXPR_ARRAY:
-          if (gfc_check_dependency (dest, expr, NULL, 0))
-            return 1;
-          break;
-
-        default:
-          break;
-        }
+      /* Skip intent(in) arguments if OTHER itself is intent(in).  */
+      if (formal
+          && intent == INTENT_IN
+          && formal->sym->attr.intent == INTENT_IN)
+        continue;
+
+      if (gfc_check_argument_dependency (other, intent, expr))
+        return 1;
     }
 
   return 0;
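The intent(in) rule above rests on a simple observation: two read-only uses of the same data cannot interfere, since a conflict needs at least one writer. A hedged C analogy of the two cases (the function names here are invented for illustration, not compiler code):

#include <stdio.h>

/* Both parameters are read-only, like two intent(in) arguments:
   passing the same array for both is harmless.  */
static double
dot3 (const double *x, const double *y)
{
  double s = 0;
  for (int i = 0; i < 3; i++)
    s += x[i] * y[i];
  return s;
}

/* OUT may be written, like an intent(out) argument: if OUT aliased X,
   elements of X could be overwritten before they are read, so this is
   the case a dependency check must guard with a temporary.  */
static void
scale3 (double *restrict out, const double *restrict x, double factor)
{
  for (int i = 0; i < 3; i++)
    out[i] = factor * x[i];
}

int
main (void)
{
  double v[3] = { 1, 2, 3 }, w[3];
  printf ("dot3(v, v) = %g\n", dot3 (v, v));  /* aliased, read-only: fine */
  scale3 (w, v, 2.0);                         /* distinct OUT required */
  printf ("w = %g %g %g\n", w[0], w[1], w[2]);
  return 0;
}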

gcc/fortran/dependency.h
@@ -22,7 +22,9 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 bool gfc_ref_needs_temporary_p (gfc_ref *);
-int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
+gfc_expr *gfc_get_noncopying_intrinsic_argument (gfc_expr *);
+int gfc_check_fncall_dependency (gfc_expr *, sym_intent, gfc_symbol *,
+                                 gfc_actual_arglist *);
 int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
 int gfc_is_same_range (gfc_array_ref *, gfc_array_ref *, int, int);
 int gfc_dep_compare_expr (gfc_expr *, gfc_expr *);

gcc/fortran/gfortran.h
@@ -1129,6 +1129,9 @@ typedef struct gfc_expr
   /* True if it is converted from Hollerith constant.  */
   unsigned int from_H : 1;
 
+  /* True if the expression is a call to a function that returns an array,
+     and if we have decided not to allocate temporary data for that array.  */
+  unsigned int inline_noncopying_intrinsic : 1;
 
   union
   {

gcc/fortran/resolve.c
@@ -24,6 +24,7 @@ Software Foundation, 51 Franklin Street, Fifth Floor,Boston, MA
 #include "system.h"
 #include "gfortran.h"
 #include "arith.h"  /* For gfc_compare_expr().  */
+#include "dependency.h"
 
 /* Types used in equivalence statements.  */
@@ -804,6 +805,24 @@ resolve_actual_arglist (gfc_actual_arglist * arg)
 }
 
 
+/* Go through each actual argument in ACTUAL and see if it can be
+   implemented as an inlined, non-copying intrinsic.  FNSYM is the
+   function being called, or NULL if not known.  */
+
+static void
+find_noncopying_intrinsics (gfc_symbol * fnsym, gfc_actual_arglist * actual)
+{
+  gfc_actual_arglist *ap;
+  gfc_expr *expr;
+
+  for (ap = actual; ap; ap = ap->next)
+    if (ap->expr
+        && (expr = gfc_get_noncopying_intrinsic_argument (ap->expr))
+        && !gfc_check_fncall_dependency (expr, INTENT_IN, fnsym, actual))
+      ap->expr->inline_noncopying_intrinsic = 1;
+}
+
+
 /************* Function resolution *************/
 
 /* Resolve a function call known to be generic.
@@ -1150,6 +1169,9 @@ resolve_function (gfc_expr * expr)
 	}
     }
 
+  if (t == SUCCESS)
+    find_noncopying_intrinsics (expr->value.function.esym,
+                                expr->value.function.actual);
   return t;
 }
 
@@ -1372,27 +1394,28 @@ resolve_call (gfc_code * c)
   if (resolve_actual_arglist (c->ext.actual) == FAILURE)
     return FAILURE;
 
-  if (c->resolved_sym != NULL)
-    return SUCCESS;
-
-  switch (procedure_kind (c->symtree->n.sym))
-    {
-    case PTYPE_GENERIC:
-      t = resolve_generic_s (c);
-      break;
-
-    case PTYPE_SPECIFIC:
-      t = resolve_specific_s (c);
-      break;
-
-    case PTYPE_UNKNOWN:
-      t = resolve_unknown_s (c);
-      break;
-
-    default:
-      gfc_internal_error ("resolve_subroutine(): bad function type");
-    }
+  t = SUCCESS;
+  if (c->resolved_sym == NULL)
+    switch (procedure_kind (c->symtree->n.sym))
+      {
+      case PTYPE_GENERIC:
+        t = resolve_generic_s (c);
+        break;
+
+      case PTYPE_SPECIFIC:
+        t = resolve_specific_s (c);
+        break;
+
+      case PTYPE_UNKNOWN:
+        t = resolve_unknown_s (c);
+        break;
+
+      default:
+        gfc_internal_error ("resolve_subroutine(): bad function type");
+      }
 
+  if (t == SUCCESS)
+    find_noncopying_intrinsics (c->resolved_sym, c->ext.actual);
   return t;
 }

gcc/fortran/trans-array.c
@@ -673,6 +673,95 @@ gfc_trans_allocate_temp_array (stmtblock_t * pre, stmtblock_t * post,
 }
 
 
+/* Generate code to transpose array EXPR by creating a new descriptor
+   in which the dimension specifications have been reversed.  */
+
+void
+gfc_conv_array_transpose (gfc_se * se, gfc_expr * expr)
+{
+  tree dest, src, dest_index, src_index;
+  gfc_loopinfo *loop;
+  gfc_ss_info *dest_info, *src_info;
+  gfc_ss *dest_ss, *src_ss;
+  gfc_se src_se;
+  int n;
+
+  loop = se->loop;
+
+  src_ss = gfc_walk_expr (expr);
+  dest_ss = se->ss;
+
+  src_info = &src_ss->data.info;
+  dest_info = &dest_ss->data.info;
+
+  /* Get a descriptor for EXPR.  */
+  gfc_init_se (&src_se, NULL);
+  gfc_conv_expr_descriptor (&src_se, expr, src_ss);
+  gfc_add_block_to_block (&se->pre, &src_se.pre);
+  gfc_add_block_to_block (&se->post, &src_se.post);
+  src = src_se.expr;
+
+  /* Allocate a new descriptor for the return value.  */
+  dest = gfc_create_var (TREE_TYPE (src), "atmp");
+  dest_info->descriptor = dest;
+  se->expr = dest;
+
+  /* Copy across the dtype field.  */
+  gfc_add_modify_expr (&se->pre,
+                       gfc_conv_descriptor_dtype (dest),
+                       gfc_conv_descriptor_dtype (src));
+
+  /* Copy the dimension information, renumbering dimension 1 to 0 and
+     0 to 1.  */
+  gcc_assert (dest_info->dimen == 2);
+  gcc_assert (src_info->dimen == 2);
+  for (n = 0; n < 2; n++)
+    {
+      dest_info->delta[n] = integer_zero_node;
+      dest_info->start[n] = integer_zero_node;
+      dest_info->stride[n] = integer_one_node;
+      dest_info->dim[n] = n;
+
+      dest_index = gfc_rank_cst[n];
+      src_index = gfc_rank_cst[1 - n];
+
+      gfc_add_modify_expr (&se->pre,
+                           gfc_conv_descriptor_stride (dest, dest_index),
+                           gfc_conv_descriptor_stride (src, src_index));
+
+      gfc_add_modify_expr (&se->pre,
+                           gfc_conv_descriptor_lbound (dest, dest_index),
+                           gfc_conv_descriptor_lbound (src, src_index));
+
+      gfc_add_modify_expr (&se->pre,
+                           gfc_conv_descriptor_ubound (dest, dest_index),
+                           gfc_conv_descriptor_ubound (src, src_index));
+
+      if (!loop->to[n])
+        {
+          gcc_assert (integer_zerop (loop->from[n]));
+          loop->to[n] = build2 (MINUS_EXPR, gfc_array_index_type,
+                                gfc_conv_descriptor_ubound (dest, dest_index),
+                                gfc_conv_descriptor_lbound (dest, dest_index));
+        }
+    }
+
+  /* Copy the data pointer.  */
+  dest_info->data = gfc_conv_descriptor_data_get (src);
+  gfc_conv_descriptor_data_set (&se->pre, dest, dest_info->data);
+
+  /* Copy the offset.  This is not changed by transposition: the top-left
+     element is still at the same offset as before.  */
+  dest_info->offset = gfc_conv_descriptor_offset (src);
+  gfc_add_modify_expr (&se->pre,
+                       gfc_conv_descriptor_offset (dest),
+                       dest_info->offset);
+
+  if (dest_info->dimen > loop->temp_dim)
+    loop->temp_dim = dest_info->dimen;
+}
+
+
 /* Return the number of iterations in a loop that starts at START,
    ends at END, and has step STEP.  */
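The descriptor manipulation above is compact; as a rough standalone illustration of the same idea, here is a minimal C sketch (the descriptor struct and helpers are simplified stand-ins of my own, not GCC's types) showing how a transpose can be produced by swapping the two dimension triplets while sharing the data pointer:

#include <assert.h>
#include <stdio.h>

/* Simplified stand-in for a 2-D gfortran array descriptor.  */
typedef struct
{
  double *data;                     /* shared data pointer */
  int lbound[2], ubound[2], stride[2];
} desc2;

/* Element access through a descriptor.  */
static double *
elt (desc2 *d, int i, int j)
{
  return d->data + (i - d->lbound[0]) * d->stride[0]
                 + (j - d->lbound[1]) * d->stride[1];
}

/* "Transpose" by swapping the dimension specifications; no element
   is copied and the data pointer is shared.  */
static desc2
transpose_desc (desc2 src)
{
  desc2 t = src;
  for (int n = 0; n < 2; n++)
    {
      t.lbound[n] = src.lbound[1 - n];
      t.ubound[n] = src.ubound[1 - n];
      t.stride[n] = src.stride[1 - n];
    }
  return t;
}

int
main (void)
{
  double a[3 * 2];
  /* A 3x2 column-major array: stride 1 along dim 0, 3 along dim 1.  */
  desc2 d = { a, { 1, 1 }, { 3, 2 }, { 1, 3 } };
  for (int j = 1; j <= 2; j++)
    for (int i = 1; i <= 3; i++)
      *elt (&d, i, j) = 10 * i + j;

  desc2 dt = transpose_desc (d);
  for (int i = 1; i <= 3; i++)
    for (int j = 1; j <= 2; j++)
      assert (*elt (&dt, j, i) == *elt (&d, i, j));
  printf ("transpose via descriptor swap: OK\n");
  return 0;
}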

gcc/fortran/trans-array.h
@@ -91,6 +91,8 @@ void gfc_conv_tmp_ref (gfc_se *);
 void gfc_conv_expr_descriptor (gfc_se *, gfc_expr *, gfc_ss *);
 /* Convert an array for passing as an actual function parameter.  */
 void gfc_conv_array_parameter (gfc_se *, gfc_expr *, gfc_ss *, int);
+/* Evaluate and transpose a matrix expression.  */
+void gfc_conv_array_transpose (gfc_se *, gfc_expr *);
 
 /* These work with both descriptors and descriptorless arrays.  */
 tree gfc_conv_array_data (tree);
@@ -112,8 +114,6 @@ tree gfc_conv_descriptor_ubound (tree, tree);
 /* Dependency checking for WHERE and FORALL.  */
 int gfc_check_dependency (gfc_expr *, gfc_expr *, gfc_expr **, int);
 
-/* Dependency checking for function calls.  */
-int gfc_check_fncall_dependency (gfc_expr *, gfc_expr *);
-
 /* Add pre-loop scalarization code for intrinsic functions which require
    special handling.  */

gcc/fortran/trans-expr.c
@@ -2627,7 +2627,9 @@ gfc_trans_arrayfunc_assign (gfc_expr * expr1, gfc_expr * expr2)
     }
 
   /* Check for a dependency.  */
-  if (gfc_check_fncall_dependency (expr1, expr2))
+  if (gfc_check_fncall_dependency (expr1, INTENT_OUT,
+                                   expr2->value.function.esym,
+                                   expr2->value.function.actual))
     return NULL;
 
   /* The frontend doesn't seem to bother filling in expr->symtree for intrinsic

gcc/fortran/trans-intrinsic.c
@@ -2894,7 +2894,7 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
       name = &expr->value.function.name[2];
 
-  if (expr->rank > 0)
+  if (expr->rank > 0 && !expr->inline_noncopying_intrinsic)
     {
       lib = gfc_is_intrinsic_libcall (expr);
       if (lib != 0)
@@ -3119,6 +3119,16 @@ gfc_conv_intrinsic_function (gfc_se * se, gfc_expr * expr)
       gfc_conv_intrinsic_bound (se, expr, 0);
       break;
 
+    case GFC_ISYM_TRANSPOSE:
+      if (se->ss && se->ss->useflags)
+        {
+          gfc_conv_tmp_array_ref (se);
+          gfc_advance_se_ss_chain (se);
+        }
+      else
+        gfc_conv_array_transpose (se, expr->value.function.actual->expr);
+      break;
+
     case GFC_ISYM_LEN:
       gfc_conv_intrinsic_len (se, expr);
       break;

libgfortran/ChangeLog
@@ -1,3 +1,13 @@
+2005-12-13  Richard Sandiford  <richard@codesourcery.com>
+	    Victor Leikehman  <LEI@il.ibm.com>
+
+	* m4/matmul.m4: Use a different order in the special case of a
+	transposed first argument.
+	* generated/matmul_c4.c, generated/matmul_c8.c, generated/matmul_c10.c,
+	generated/matmul_c16.c, generated/matmul_i4.c, generated/matmul_i8.c,
+	generated/matmul_i16.c, generated/matmul_r4.c, generated/matmul_r8.c,
+	generated/matmul_r10.c, generated/matmul_r16.c: Regenerated.
+
 2005-12-10  Janne Blomqvist  <jb@gcc.gnu.org>
 
 	* Makefile.am: Enable loop unrolling for matmul.

libgfortran/generated/matmul_c4.c, matmul_c8.c, matmul_c10.c, matmul_c16.c,
libgfortran/generated/matmul_i4.c, matmul_i8.c, matmul_i16.c,
libgfortran/generated/matmul_r4.c, matmul_r8.c, matmul_r10.c, matmul_r16.c:
regenerated copies of the m4/matmul.m4 change below, with rtype_name and
rtype_code expanded for each kind.

libgfortran/m4/matmul.m4
@@ -37,16 +37,29 @@ include(iparm.m4)dnl
 `#if defined (HAVE_'rtype_name`)'
 
-/* This is a C version of the following fortran pseudo-code.  The key
-   point is the loop order -- we access all arrays column-first, which
-   improves the performance enough to boost galgel spec score by 50%.
+/* The order of loops is different in the case of plain matrix
+   multiplication C=MATMUL(A,B), and in the frequent special case where
+   the argument A is the temporary result of a TRANSPOSE intrinsic:
+   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
+   looking at their strides.
+
+   The equivalent Fortran pseudo-code is:
 
    DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
-   C = 0
-   DO J=1,N
-     DO K=1,COUNT
-       DO I=1,M
-         C(I,J) = C(I,J)+A(I,K)*B(K,J)
+   IF (.NOT.IS_TRANSPOSED(A)) THEN
+     C = 0
+     DO J=1,N
+       DO K=1,COUNT
+         DO I=1,M
+           C(I,J) = C(I,J)+A(I,K)*B(K,J)
+   ELSE
+     DO J=1,N
+       DO I=1,M
+         S = 0
+         DO K=1,COUNT
+           S = S+A(I,K)*B(K,J)
+         C(I,J) = S
+   ENDIF
 */
 
 extern void matmul_`'rtype_code (rtype * const restrict retarray,
@@ -206,7 +219,28 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	    }
 	}
     }
-  else
+  else if (rxstride == 1 && aystride == 1 && bxstride == 1)
+    {
+      const rtype_name *restrict abase_x;
+      const rtype_name *restrict bbase_y;
+      rtype_name *restrict dest_y;
+      rtype_name s;
+
+      for (y = 0; y < ycount; y++)
+	{
+	  bbase_y = &bbase[y*bystride];
+	  dest_y = &dest[y*rystride];
+	  for (x = 0; x < xcount; x++)
+	    {
+	      abase_x = &abase[x*axstride];
+	      s = (rtype_name) 0;
+	      for (n = 0; n < count; n++)
+		s += abase_x[n] * bbase_y[n];
+	      dest_y[x] = s;
+	    }
+	}
+    }
+  else if (axstride < aystride)
     {
       for (y = 0; y < ycount; y++)
 	for (x = 0; x < xcount; x++)
@@ -218,6 +252,27 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 	      /* dest[x,y] += a[x,n] * b[n,y] */
 	      dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride];
 	    }
+  else
+    {
+      const rtype_name *restrict abase_x;
+      const rtype_name *restrict bbase_y;
+      rtype_name *restrict dest_y;
+      rtype_name s;
+
+      for (y = 0; y < ycount; y++)
+	{
+	  bbase_y = &bbase[y*bystride];
+	  dest_y = &dest[y*rystride];
+	  for (x = 0; x < xcount; x++)
+	    {
+	      abase_x = &abase[x*axstride];
+	      s = (rtype_name) 0;
+	      for (n = 0; n < count; n++)
+		s += abase_x[n*aystride] * bbase_y[n*bxstride];
+	      dest_y[x*rxstride] = s;
+	    }
+	}
+    }
 }
 
 #endif
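To see the two orders side by side, here is a small self-contained C program (an illustration with made-up sizes, not libgfortran code) that computes MATMUL(TRANSPOSE(A), B) with both loop orders from the comment above and checks that they agree:

#include <assert.h>
#include <stdio.h>

#define M 3
#define N 4
#define CNT 5

int
main (void)
{
  /* Column-major storage, as in Fortran.  at[] plays the role of the
     TRANSPOSE(A) temporary: element (i,k) is at[i*CNT + k], i.e. the
     "rows" are contiguous (axstride = CNT, aystride = 1).  */
  double at[M * CNT], b[CNT * N], c1[M * N] = { 0 }, c2[M * N];

  for (int i = 0; i < M * CNT; i++)
    at[i] = i + 1;
  for (int i = 0; i < CNT * N; i++)
    b[i] = 2 * i - 3;

  /* Generic order: column-first accumulation into C.  */
  for (int j = 0; j < N; j++)
    for (int k = 0; k < CNT; k++)
      for (int i = 0; i < M; i++)
        c1[j * M + i] += at[i * CNT + k] * b[j * CNT + k];

  /* Transposed-temporary order: a unit-stride dot product per element.  */
  for (int j = 0; j < N; j++)
    for (int i = 0; i < M; i++)
      {
        double s = 0;
        for (int k = 0; k < CNT; k++)
          s += at[i * CNT + k] * b[j * CNT + k];
        c2[j * M + i] = s;
      }

  for (int i = 0; i < M * N; i++)
    assert (c1[i] == c2[i]);
  printf ("both loop orders agree\n");
  return 0;
}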