diff --git a/libgfortran/ChangeLog b/libgfortran/ChangeLog index 000d49656d0..15315d270f7 100644 --- a/libgfortran/ChangeLog +++ b/libgfortran/ChangeLog @@ -1,3 +1,8 @@ +2004-11-18 Victor Leikehman + + * m4/matmul.m4: Loops reordered to improve cache behavior. + * generated/matmul_??.c: Regenerated. + 2004-11-10 Paul Brook PR fortran/18218 diff --git a/libgfortran/generated/matmul_c4.c b/libgfortran/generated/matmul_c4.c index 7967e970646..fd265d8b5b1 100644 --- a/libgfortran/generated/matmul_c4.c +++ b/libgfortran/generated/matmul_c4.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) { GFC_COMPLEX_4 *abase; GFC_COMPLEX_4 *bbase; GFC_COMPLEX_4 *dest; - GFC_COMPLEX_4 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_COMPLEX_4 *pa; - GFC_COMPLEX_4 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_c4 (gfc_array_c4 * retarray, gfc_array_c4 * a, gfc_array_c4 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_COMPLEX_4 *bbase_y; + GFC_COMPLEX_4 *dest_y; + GFC_COMPLEX_4 *abase_n; + GFC_COMPLEX_4 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_COMPLEX_4) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_4)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/generated/matmul_c8.c b/libgfortran/generated/matmul_c8.c index 7ed46ec57a9..bc51e4a9db3 100644 --- a/libgfortran/generated/matmul_c8.c +++ b/libgfortran/generated/matmul_c8.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_c8 (gfc_array_c8 * retarray, gfc_array_c8 * a, gfc_array_c8 * b) { GFC_COMPLEX_8 *abase; GFC_COMPLEX_8 *bbase; GFC_COMPLEX_8 *dest; - GFC_COMPLEX_8 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_COMPLEX_8 *pa; - GFC_COMPLEX_8 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_c8 (gfc_array_c8 * retarray, gfc_array_c8 * a, gfc_array_c8 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_c8 (gfc_array_c8 * retarray, gfc_array_c8 * a, gfc_array_c8 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_COMPLEX_8 *bbase_y; + GFC_COMPLEX_8 *dest_y; + GFC_COMPLEX_8 *abase_n; + GFC_COMPLEX_8 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_COMPLEX_8) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_COMPLEX_8)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/generated/matmul_i4.c b/libgfortran/generated/matmul_i4.c index 0db573cf60c..7b8cfbdd995 100644 --- a/libgfortran/generated/matmul_i4.c +++ b/libgfortran/generated/matmul_i4.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_i4 (gfc_array_i4 * retarray, gfc_array_i4 * a, gfc_array_i4 * b) { GFC_INTEGER_4 *abase; GFC_INTEGER_4 *bbase; GFC_INTEGER_4 *dest; - GFC_INTEGER_4 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_INTEGER_4 *pa; - GFC_INTEGER_4 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_i4 (gfc_array_i4 * retarray, gfc_array_i4 * a, gfc_array_i4 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_i4 (gfc_array_i4 * retarray, gfc_array_i4 * a, gfc_array_i4 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_INTEGER_4 *bbase_y; + GFC_INTEGER_4 *dest_y; + GFC_INTEGER_4 *abase_n; + GFC_INTEGER_4 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_INTEGER_4) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_4)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/generated/matmul_i8.c b/libgfortran/generated/matmul_i8.c index 1a8e8dcb6b9..c84c0241301 100644 --- a/libgfortran/generated/matmul_i8.c +++ b/libgfortran/generated/matmul_i8.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_i8 (gfc_array_i8 * retarray, gfc_array_i8 * a, gfc_array_i8 * b) { GFC_INTEGER_8 *abase; GFC_INTEGER_8 *bbase; GFC_INTEGER_8 *dest; - GFC_INTEGER_8 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_INTEGER_8 *pa; - GFC_INTEGER_8 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_i8 (gfc_array_i8 * retarray, gfc_array_i8 * a, gfc_array_i8 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_i8 (gfc_array_i8 * retarray, gfc_array_i8 * a, gfc_array_i8 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_INTEGER_8 *bbase_y; + GFC_INTEGER_8 *dest_y; + GFC_INTEGER_8 *abase_n; + GFC_INTEGER_8 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_INTEGER_8) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_INTEGER_8)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/generated/matmul_r4.c b/libgfortran/generated/matmul_r4.c index 7d111369b12..6896a2e9bef 100644 --- a/libgfortran/generated/matmul_r4.c +++ b/libgfortran/generated/matmul_r4.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_r4 (gfc_array_r4 * retarray, gfc_array_r4 * a, gfc_array_r4 * b) { GFC_REAL_4 *abase; GFC_REAL_4 *bbase; GFC_REAL_4 *dest; - GFC_REAL_4 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_REAL_4 *pa; - GFC_REAL_4 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_r4 (gfc_array_r4 * retarray, gfc_array_r4 * a, gfc_array_r4 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_r4 (gfc_array_r4 * retarray, gfc_array_r4 * a, gfc_array_r4 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_REAL_4 *bbase_y; + GFC_REAL_4 *dest_y; + GFC_REAL_4 *abase_n; + GFC_REAL_4 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_REAL_4) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_4)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/generated/matmul_r8.c b/libgfortran/generated/matmul_r8.c index 5ab66fe073d..f0fc1a64a85 100644 --- a/libgfortran/generated/matmul_r8.c +++ b/libgfortran/generated/matmul_r8.c @@ -21,37 +21,46 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h" -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void __matmul_r8 (gfc_array_r8 * retarray, gfc_array_r8 * a, gfc_array_r8 * b) { GFC_REAL_8 *abase; GFC_REAL_8 *bbase; GFC_REAL_8 *dest; - GFC_REAL_8 res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - GFC_REAL_8 *pa; - GFC_REAL_8 *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -95,8 +104,10 @@ __matmul_r8 (gfc_array_r8 * retarray, gfc_array_r8 * a, gfc_array_r8 * b) if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -104,65 +115,88 @@ __matmul_r8 (gfc_array_r8 * retarray, gfc_array_r8 * a, gfc_array_r8 * b) rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + GFC_REAL_8 *bbase_y; + GFC_REAL_8 *dest_y; + GFC_REAL_8 *abase_n; + GFC_REAL_8 bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (GFC_REAL_8) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (GFC_REAL_8)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } } diff --git a/libgfortran/m4/matmul.m4 b/libgfortran/m4/matmul.m4 index 7a54b05595c..0602be69061 100644 --- a/libgfortran/m4/matmul.m4 +++ b/libgfortran/m4/matmul.m4 @@ -21,38 +21,47 @@ Boston, MA 02111-1307, USA. */ #include "config.h" #include +#include #include #include "libgfortran.h"' include(iparm.m4)dnl -/* Dimensions: retarray(x,y) a(x, count) b(count,y). - Either a or b can be rank 1. In this case x or y is 1. */ +/* This is a C version of the following fortran pseudo-code. The key + point is the loop order -- we access all arrays column-first, which + improves the performance enough to boost galgel spec score by 50%. + + DIMENSION A(M,COUNT), B(COUNT,N), C(M,N) + C = 0 + DO J=1,N + DO K=1,COUNT + DO I=1,M + C(I,J) = C(I,J)+A(I,K)*B(K,J) +*/ + void `__matmul_'rtype_code (rtype * retarray, rtype * a, rtype * b) { rtype_name *abase; rtype_name *bbase; rtype_name *dest; - rtype_name res; - index_type rxstride; - index_type rystride; - index_type xcount; - index_type ycount; - index_type xstride; - index_type ystride; - index_type x; - index_type y; - rtype_name *pa; - rtype_name *pb; - index_type astride; - index_type bstride; - index_type count; - index_type n; + index_type rxstride, rystride, axstride, aystride, bxstride, bystride; + index_type x, y, n, count, xcount, ycount; assert (GFC_DESCRIPTOR_RANK (a) == 2 || GFC_DESCRIPTOR_RANK (b) == 2); +/* C[xcount,ycount] = A[xcount, count] * B[count,ycount] + + Either A or B (but not both) can be rank 1: + + o One-dimensional argument A is implicitly treated as a row matrix + dimensioned [1,count], so xcount=1. + + o One-dimensional argument B is implicitly treated as a column matrix + dimensioned [count, 1], so ycount=1. + */ + if (retarray->data == NULL) { if (GFC_DESCRIPTOR_RANK (a) == 1) @@ -97,8 +106,10 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl if (GFC_DESCRIPTOR_RANK (retarray) == 1) { - rxstride = retarray->dim[0].stride; - rystride = rxstride; + /* One-dimensional result may be addressed in the code below + either as a row or a column matrix. We want both cases to + work. */ + rxstride = rystride = retarray->dim[0].stride; } else { @@ -106,65 +117,88 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl rystride = retarray->dim[1].stride; } - /* If we have rank 1 parameters, zero the absent stride, and set the size to - one. */ + if (GFC_DESCRIPTOR_RANK (a) == 1) { - astride = a->dim[0].stride; - count = a->dim[0].ubound + 1 - a->dim[0].lbound; - xstride = 0; - rxstride = 0; + /* Treat it as a a row matrix A[1,count]. */ + axstride = a->dim[0].stride; + aystride = 1; + xcount = 1; + count = a->dim[0].ubound + 1 - a->dim[0].lbound; } else { - astride = a->dim[1].stride; + axstride = a->dim[0].stride; + aystride = a->dim[1].stride; + count = a->dim[1].ubound + 1 - a->dim[1].lbound; - xstride = a->dim[0].stride; xcount = a->dim[0].ubound + 1 - a->dim[0].lbound; } + + assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); + if (GFC_DESCRIPTOR_RANK (b) == 1) { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = 0; - rystride = 0; + /* Treat it as a column matrix B[count,1] */ + bxstride = b->dim[0].stride; + + /* bystride should never be used for 1-dimensional b. + in case it is we want it to cause a segfault, rather than + an incorrect result. */ + bystride = 0xDEADBEEF; ycount = 1; } else { - bstride = b->dim[0].stride; - assert(count == b->dim[0].ubound + 1 - b->dim[0].lbound); - ystride = b->dim[1].stride; + bxstride = b->dim[0].stride; + bystride = b->dim[1].stride; ycount = b->dim[1].ubound + 1 - b->dim[1].lbound; } - for (y = 0; y < ycount; y++) + assert (a->base == 0); + assert (b->base == 0); + assert (retarray->base == 0); + + abase = a->data; + bbase = b->data; + dest = retarray->data; + + if (rxstride == 1 && axstride == 1 && bxstride == 1) { - for (x = 0; x < xcount; x++) - { - /* Do the summation for this element. For real and integer types - this is the same as DOT_PRODUCT. For complex types we use do - a*b, not conjg(a)*b. */ - pa = abase; - pb = bbase; - res = 0; + rtype_name *bbase_y; + rtype_name *dest_y; + rtype_name *abase_n; + rtype_name bbase_yn; - for (n = 0; n < count; n++) - { - res += *pa * *pb; - pa += astride; - pb += bstride; - } + memset (dest, 0, (sizeof (rtype_name) * size0(retarray))); - *dest = res; + for (y = 0; y < ycount; y++) + { + bbase_y = bbase + y*bystride; + dest_y = dest + y*rystride; + for (n = 0; n < count; n++) + { + abase_n = abase + n*aystride; + bbase_yn = bbase_y[n]; + for (x = 0; x < xcount; x++) + { + dest_y[x] += abase_n[x] * bbase_yn; + } + } + } + } + else + { + for (y = 0; y < ycount; y++) + for (x = 0; x < xcount; x++) + dest[x*rxstride + y*rystride] = (rtype_name)0; - dest += rxstride; - abase += xstride; - } - abase -= xstride * xcount; - bbase += ystride; - dest += rystride - (rxstride * xcount); + for (y = 0; y < ycount; y++) + for (n = 0; n < count; n++) + for (x = 0; x < xcount; x++) + /* dest[x,y] += a[x,n] * b[n,y] */ + dest[x*rxstride + y*rystride] += abase[x*axstride + n*aystride] * bbase[n*bxstride + y*bystride]; } }