tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.

commit 5417e0224b (parent cd5ecab6a7)

	* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
	(SET_DR_MISALIGNMENT): New.
	* tree-vect-analyze.c (vect_compute_data_ref_alignment,
	vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
	Use SET_DR_MISALIGNMENT.
	* tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
	pointer.
	* tree-data-ref.c (create_data_ref, compute_all_dependences,
	find_loop_nest): Export.
	* tree-data-ref.h (struct data_reference): Change aux field to pointer.
	(create_data_ref, compute_all_dependences, find_loop_nest): Declare.
	* tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
	(L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
	New macros.
	(struct mem_ref): Add field reuse_distance.
	(find_or_create_group, record_ref): Use XNEW instead of xcalloc.
	Initialize reuse_distance field.
	(issue_prefetch_ref): Select temporality of prefetch according to
	reuse_distance.
	(volume_of_references, volume_of_dist_vector, add_subscript_strides,
	self_reuse_distance, determine_loop_nest_reuse): New functions.
	(loop_prefetch_arrays): Call determine_loop_nest_reuse.
	(tree_ssa_prefetch_arrays): Dump L2 cache size.
	* Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
	dependency.

	* gcc.dg/tree-ssa/prefetch-6.c: New test.

From-SVN: r125172
gcc/ChangeLog
@@ -1,9 +1,37 @@
+2007-05-29  Zdenek Dvorak  <dvorakz@suse.cz>
+
+	* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
+	(SET_DR_MISALIGNMENT): New.
+	* tree-vect-analyze.c (vect_compute_data_ref_alignment,
+	vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
+	Use SET_DR_MISALIGNMENT.
+	* tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
+	pointer.
+	* tree-data-ref.c (create_data_ref, compute_all_dependences,
+	find_loop_nest): Export.
+	* tree-data-ref.h (struct data_reference): Change aux field to pointer.
+	(create_data_ref, compute_all_dependences, find_loop_nest): Declare.
+	* tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
+	(L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
+	New macros.
+	(struct mem_ref): Add field reuse_distance.
+	(find_or_create_group, record_ref): Use XNEW instead of xcalloc.
+	Initialize reuse_distance field.
+	(issue_prefetch_ref): Select temporality of prefetch according to
+	reuse_distance.
+	(volume_of_references, volume_of_dist_vector, add_subscript_strides,
+	self_reuse_distance, determine_loop_nest_reuse): New functions.
+	(loop_prefetch_arrays): Call determine_loop_nest_reuse.
+	(tree_ssa_prefetch_arrays): Dump L2 cache size.
+	* Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
+	dependency.
+
 2007-05-29  Daniel Berlin  <dberlin@dberlin.org>
 
 	* tree-ssa-alias.c: Add aliasing overview.
 
 2007-05-29  Zuxy Meng  <zuxy.meng@gmail.com>
 	    Danny Smith  <dannysmith@users.sourceforge.net>
 
 	PR target/29498
 	* config/i386/t-crtfm: Compile crtfastmath.o with
gcc/Makefile.in
@@ -2084,7 +2084,7 @@ tree-ssa-loop-prefetch.o: tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
 	output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
 	tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H) \
 	$(CFGLOOP_H) $(PARAMS_H) langhooks.h $(BASIC_BLOCK_H) hard-reg-set.h \
-	tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H)
+	tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H) $(TREE_DATA_REF_H)
 tree-predcom.o: tree-predcom.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_P_H) \
 	$(CFGLOOP_H) $(TREE_FLOW_H) $(GGC_H) $(TREE_DATA_REF_H) $(SCEV_H) \
 	$(PARAMS_H) $(DIAGNOSTIC_H) tree-pass.h $(TM_H) coretypes.h tree-affine.h \
gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-05-29  Zdenek Dvorak  <dvorakz@suse.cz>
+
+	* gcc.dg/tree-ssa/prefetch-6.c: New test.
+
 2007-05-29  Tobias Schlüter  <tobi@gcc.gnu.org>
 
 	* gfortran.dg/sizeof.f90: New.
gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c (new file, 54 lines)
@@ -0,0 +1,54 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse --param simultaneous-prefetches=100 -fdump-tree-aprefetch-details" } */
+
+#define N 1000
+#define K 900
+
+double a[N][N];
+
+double test(void)
+{
+  unsigned i, j;
+  double sum = 0;
+
+  /* Here, we should use non-temporal prefetch instruction.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j];
+
+  /* Here, we should not use non-temporal prefetch instruction, since the
+     value of a[i+10][j] is reused in L2 cache.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j] * a[i + 10][j];
+
+  /* Here, we should use non-temporal prefetch instruction, since the
+     value of a[i+100][j] is too far to be reused in L2 cache.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j] * a[i + 100][j];
+
+  /* Here, temporal prefetches should be used, since the volume of the
+     memory accesses is smaller than L2 cache.  */
+  for (i = 0; i < 100; i++)
+    for (j = 0; j < 100; j++)
+      sum += a[i][j] * a[i + 100][j];
+
+  /* Temporal prefetches should be used here (even though the accesses to
+     a[j][i] are independent, the same cache line is almost always hit
+     every N iterations).  */
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      sum += a[j][i];
+
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "Issued prefetch" 5 "aprefetch" } } */
+/* { dg-final { scan-tree-dump-times "Issued nontemporal prefetch" 3 "aprefetch" } } */
+
+/* { dg-final { scan-assembler-times "prefetcht" 5 } } */
+/* { dg-final { scan-assembler-times "prefetchnta" 3 } } */
+
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
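The 5/3 split the test scans for follows from the reuse-distance arithmetic this patch introduces. A back-of-envelope check, with assumed cache parameters (the real values come from the L1_CACHE_SIZE and L1_CACHE_LINE_SIZE params for -march=athlon; the 64-byte line and 64 KB L1 used here are assumptions, with L2 derived as 8 * L1 per the new macro):

/* Sketch of the classification for the a[i][j] / a[i+k][j] loops above.
   Cache geometry is assumed for illustration, not taken from the compiler.  */
#include <stdio.h>

int main (void)
{
  unsigned line = 64;                   /* assumed L1 line size in bytes */
  unsigned l2 = 8 * 64 * 1024;          /* assumed L2 = 8 * (64 KB L1) */
  unsigned refs = 2;                    /* a[i][j] and a[i + k][j] */
  unsigned per_iter = refs * line / 8;  /* line / PREFETCH_MOD, mod = line/8 */
  unsigned inner = 900;                 /* K iterations of the j loop */
  unsigned row = per_iter * inner;      /* bytes touched per i iteration */

  printf ("k = 10 : reuse after %7u bytes -> %s\n", 10 * row,
          10 * row >= l2 ? "nontemporal" : "temporal");
  printf ("k = 100: reuse after %7u bytes -> %s\n", 100 * row,
          100 * row >= l2 ? "nontemporal" : "temporal");
  return 0;
}

Under these assumptions the tally works out to one nontemporal prefetch in the first loop (no reuse at all), two temporal in the second, two nontemporal in the third, two temporal in the fourth (total volume below L2), and one temporal in the fifth: the 5 temporal / 3 nontemporal counts in the dg-final lines.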
gcc/tree-data-ref.c
@@ -771,7 +771,7 @@ free_data_ref (data_reference_p dr)
    data_reference description of MEMREF.  NEST is the outermost loop of the
    loop nest in that the reference should be analysed.  */
 
-static struct data_reference *
+struct data_reference *
 create_data_ref (struct loop *nest, tree memref, tree stmt, bool is_read)
 {
   struct data_reference *dr;
@@ -3843,7 +3843,7 @@ compute_self_dependence (struct data_dependence_relation *ddr)
    COMPUTE_SELF_AND_RR is FALSE, don't compute read-read and self
    relations.  */
 
-static void
+void
 compute_all_dependences (VEC (data_reference_p, heap) *datarefs,
 			 VEC (ddr_p, heap) **dependence_relations,
 			 VEC (loop_p, heap) *loop_nest,
@@ -4055,7 +4055,7 @@ find_loop_nest_1 (struct loop *loop, VEC (loop_p, heap) **loop_nest)
    contain the loops from the outermost to the innermost, as they will
    appear in the classic distance vector.  */
 
-static bool
+bool
 find_loop_nest (struct loop *loop, VEC (loop_p, heap) **loop_nest)
 {
   VEC_safe_push (loop_p, heap, *loop_nest, loop);
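These three functions lose their static qualifier so that passes outside the dependence framework can drive the analysis themselves. A condensed, hypothetical sketch of the call pattern (it abbreviates the determine_loop_nest_reuse code added later in this diff; the helper name analyse_nest is invented for illustration, and error handling is omitted):

/* Hypothetical consumer of the newly exported interface.  */
static void
analyse_nest (struct loop *nest, tree mem, tree stmt)
{
  VEC (loop_p, heap) *loops = NULL;
  VEC (data_reference_p, heap) *datarefs = NULL;
  VEC (ddr_p, heap) *dependences = NULL;
  struct data_reference *dr;

  find_loop_nest (nest, &loops);
  dr = create_data_ref (nest, mem, stmt, true);
  if (dr)
    VEC_safe_push (data_reference_p, heap, datarefs, dr);
  compute_all_dependences (datarefs, &dependences, loops, true);

  /* ... inspect DDR_DIST_VECT of each dependence here ...  */

  free_dependence_relations (dependences);
  free_data_refs (datarefs);
  VEC_free (loop_p, heap, loops);
}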
gcc/tree-data-ref.h
@@ -104,7 +104,7 @@ struct data_reference
   tree ref;
 
   /* Auxiliary info specific to a pass.  */
-  int aux;
+  void *aux;
 
   /* True when the data reference is in RHS of a stmt.  */
   bool is_read;
@@ -320,7 +320,10 @@ extern void dump_data_dependence_direction (FILE *,
 extern void free_dependence_relation (struct data_dependence_relation *);
 extern void free_dependence_relations (VEC (ddr_p, heap) *);
 extern void free_data_refs (VEC (data_reference_p, heap) *);
+
+struct data_reference *create_data_ref (struct loop *, tree, tree, bool);
+bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
+void compute_all_dependences (VEC (data_reference_p, heap) *,
+			      VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool);
 
 /* Return the index of the variable VAR in the LOOP_NEST array.  */
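With aux widened to a pointer, passes that kept small integers in it (tree-predcom's component indices in the hunks below, the vectorizer's misalignment via DR_MISALIGNMENT at the end of this diff) now round-trip them through size_t, while the prefetcher can store a real pointer. A minimal standalone sketch of the integer round-trip, assuming nothing beyond standard C (the intermediate size_t cast avoids int-to-pointer width warnings):

#include <stddef.h>
#include <stdio.h>

/* Stand-in for struct data_reference; only the aux field matters here.  */
struct data_reference_like { void *aux; };

int main (void)
{
  struct data_reference_like dr;

  /* Store an integer in the pointer-sized field...  */
  dr.aux = (void *) (size_t) 42;
  /* ...and read it back; no memory is allocated or dereferenced.  */
  printf ("%d\n", (int) (size_t) dr.aux);
  return 0;
}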
gcc/tree-predcom.c
@@ -700,7 +700,7 @@ split_data_refs_to_components (struct loop *loop,
 	     just fail.  */
 	  goto end;
 	}
-      dr->aux = i;
+      dr->aux = (void *) (size_t) i;
       comp_father[i] = i;
       comp_size[i] = 1;
     }
@@ -715,7 +715,7 @@ split_data_refs_to_components (struct loop *loop,
 
       if (!suitable_reference_p (dr, &dummy))
 	{
-	  ia = dr->aux;
+	  ia = (unsigned) (size_t) dr->aux;
 	  merge_comps (comp_father, comp_size, n, ia);
 	}
     }
@@ -729,8 +729,8 @@ split_data_refs_to_components (struct loop *loop,
 
       dra = DDR_A (ddr);
       drb = DDR_B (ddr);
-      ia = component_of (comp_father, dra->aux);
-      ib = component_of (comp_father, drb->aux);
+      ia = component_of (comp_father, (unsigned) (size_t) dra->aux);
+      ib = component_of (comp_father, (unsigned) (size_t) drb->aux);
       if (ia == ib)
 	continue;
 
@@ -749,7 +749,7 @@ split_data_refs_to_components (struct loop *loop,
   bad = component_of (comp_father, n);
   for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
     {
-      ia = dr->aux;
+      ia = (unsigned) (size_t) dr->aux;
       ca = component_of (comp_father, ia);
       if (ca == bad)
 	continue;
gcc/tree-ssa-loop-prefetch.c
@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include "params.h"
 #include "langhooks.h"
 #include "tree-inline.h"
+#include "tree-data-ref.h"
 
 /* This pass inserts prefetch instructions to optimize cache usage during
    accesses to arrays in loops.  It processes loops sequentially and:
@@ -82,6 +83,10 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
       7/32.
       (5) has PREFETCH_MOD 1 as well.
 
+      Additionally, we use data dependence analysis to determine for each
+      reference the distance till the first reuse; this information is used
+      to determine the temporality of the issued prefetch instruction.
+
    3) We determine how much ahead we need to prefetch.  The number of
       iterations needed is time to fetch / time spent in one iteration of
       the loop.  The problem is that we do not know either of these values,
@@ -161,6 +166,17 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #define HAVE_prefetch 0
 #endif
 
+#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * L1_CACHE_LINE_SIZE))
+/* TODO: Add parameter to specify L2 cache size.  */
+#define L2_CACHE_SIZE_BYTES (8 * L1_CACHE_SIZE_BYTES)
+
+/* We consider a memory access nontemporal if it is not reused sooner than
+   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
+   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+   so that we use nontemporal prefetches e.g. if single memory location
+   is accessed several times in a single iteration of the loop.  */
+#define NONTEMPORAL_FRACTION 16
+
 /* The group of references between that reuse may occur.  */
 
 struct mem_ref_group
@@ -190,6 +206,8 @@ struct mem_ref
   unsigned HOST_WIDE_INT prefetch_before;
 			/* Prefetch only first PREFETCH_BEFORE
 			   iterations.  */
+  unsigned reuse_distance;	/* The amount of data accessed before the first
+				   reuse of this value.  */
   bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
   struct mem_ref *next;		/* The next reference in the group.  */
 };
@@ -236,7 +254,7 @@ find_or_create_group (struct mem_ref_group **groups, tree base,
 	break;
     }
 
-  group = xcalloc (1, sizeof (struct mem_ref_group));
+  group = XNEW (struct mem_ref_group);
   group->base = base;
   group->step = step;
   group->refs = NULL;
@@ -273,13 +291,14 @@ record_ref (struct mem_ref_group *group, tree stmt, tree mem,
       return;
     }
 
-  (*aref) = xcalloc (1, sizeof (struct mem_ref));
+  (*aref) = XNEW (struct mem_ref);
   (*aref)->stmt = stmt;
   (*aref)->mem = mem;
   (*aref)->delta = delta;
   (*aref)->write_p = write_p;
   (*aref)->prefetch_before = PREFETCH_ALL;
   (*aref)->prefetch_mod = 1;
+  (*aref)->reuse_distance = 0;
   (*aref)->issue_prefetch_p = false;
   (*aref)->group = group;
   (*aref)->next = NULL;
@@ -815,12 +834,15 @@ static void
 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 {
   HOST_WIDE_INT delta;
-  tree addr, addr_base, prefetch, write_p;
+  tree addr, addr_base, prefetch, write_p, local;
   block_stmt_iterator bsi;
   unsigned n_prefetches, ap;
+  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
+    fprintf (dump_file, "Issued%s prefetch for %p.\n",
+	     nontemporal ? " nontemporal" : "",
+	     (void *) ref);
 
   bsi = bsi_for_stmt (ref->stmt);
 
@@ -829,6 +851,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
   write_p = ref->write_p ? integer_one_node : integer_zero_node;
+  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
@@ -840,7 +863,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 
       /* Create the prefetch instruction.  */
       prefetch = build_call_expr (built_in_decls[BUILT_IN_PREFETCH],
-				  2, addr, write_p);
+				  3, addr, write_p, local);
       bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
     }
 }
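The third argument added to the call is __builtin_prefetch's locality hint: 0 means the data has no temporal locality (on x86 this typically lowers to prefetchnta), while 3 means high temporal locality (prefetcht0). A hand-written equivalent of what the pass now emits:

/* What the generated GIMPLE corresponds to in source form.  The locality
   argument must be a compile-time constant, hence the two separate calls.  */
void
touch (const double *p, int nontemporal)
{
  if (nontemporal)
    __builtin_prefetch (p, 0, 0);   /* read, no reuse expected: prefetchnta */
  else
    __builtin_prefetch (p, 0, 3);   /* read, high reuse expected: prefetcht0 */
}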
@@ -935,6 +958,311 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
   return factor;
 }
 
+/* Returns the total volume of the memory references REFS, taking into account
+   reuses in the innermost loop and cache line size.  TODO -- we should also
+   take into account reuses across the iterations of the loops in the loop
+   nest.  */
+
+static unsigned
+volume_of_references (struct mem_ref_group *refs)
+{
+  unsigned volume = 0;
+  struct mem_ref_group *gr;
+  struct mem_ref *ref;
+
+  for (gr = refs; gr; gr = gr->next)
+    for (ref = gr->refs; ref; ref = ref->next)
+      {
+	/* Almost always reuses another value?  */
+	if (ref->prefetch_before != PREFETCH_ALL)
+	  continue;
+
+	/* If several iterations access the same cache line, use the size of
+	   the line divided by this number.  Otherwise, a cache line is
+	   accessed in each iteration.  TODO -- in the latter case, we should
+	   take the size of the reference into account, rounding it up on cache
+	   line size multiple.  */
+	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
+      }
+  return volume;
+}
+
+/* Returns the volume of memory references accessed across VEC iterations of
+   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
+   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
+
+static unsigned
+volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
+{
+  unsigned i;
+
+  for (i = 0; i < n; i++)
+    if (vec[i] != 0)
+      break;
+
+  if (i == n)
+    return 0;
+
+  gcc_assert (vec[i] > 0);
+
+  /* We ignore the parts of the distance vector in subloops, since usually
+     the numbers of iterations are much smaller.  */
+  return loop_sizes[i] * vec[i];
+}
+
+/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
+   at the position corresponding to the loop of the step.  N is the depth
+   of the considered loop nest, and, LOOP is its innermost loop.  */
+
+static void
+add_subscript_strides (tree access_fn, unsigned stride,
+		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
+{
+  struct loop *aloop;
+  tree step;
+  HOST_WIDE_INT astep;
+  unsigned min_depth = loop_depth (loop) - n;
+
+  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
+    {
+      aloop = get_chrec_loop (access_fn);
+      step = CHREC_RIGHT (access_fn);
+      access_fn = CHREC_LEFT (access_fn);
+
+      if ((unsigned) loop_depth (aloop) <= min_depth)
+	continue;
+
+      if (host_integerp (step, 0))
+	astep = tree_low_cst (step, 0);
+      else
+	astep = L1_CACHE_LINE_SIZE;
+
+      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
+    }
+}
+
+/* Returns the volume of memory references accessed between two consecutive
+   self-reuses of the reference DR.  We consider the subscripts of DR in N
+   loops, and LOOP_SIZES contains the volumes of accesses in each of the
+   loops.  LOOP is the innermost loop of the current loop nest.  */
+
+static unsigned
+self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
+		     struct loop *loop)
+{
+  tree stride, access_fn;
+  HOST_WIDE_INT *strides, astride;
+  VEC (tree, heap) *access_fns;
+  tree ref = DR_REF (dr);
+  unsigned i, ret = ~0u;
+
+  /* In the following example:
+
+     for (i = 0; i < N; i++)
+       for (j = 0; j < N; j++)
+         use (a[j][i]);
+     the same cache line is accessed each N steps (except if the change from
+     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
+     we cannot rely purely on the results of the data dependence analysis.
+
+     Instead, we compute the stride of the reference in each loop, and consider
+     the innermost loop in that the stride is less than cache size.  */
+
+  strides = XCNEWVEC (HOST_WIDE_INT, n);
+  access_fns = DR_ACCESS_FNS (dr);
+
+  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
+    {
+      /* Keep track of the reference corresponding to the subscript, so that we
+	 know its stride.  */
+      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
+	ref = TREE_OPERAND (ref, 0);
+
+      if (TREE_CODE (ref) == ARRAY_REF)
+	{
+	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+	  if (host_integerp (stride, 1))
+	    astride = tree_low_cst (stride, 1);
+	  else
+	    astride = L1_CACHE_LINE_SIZE;
+
+	  ref = TREE_OPERAND (ref, 0);
+	}
+      else
+	astride = 1;
+
+      add_subscript_strides (access_fn, astride, strides, n, loop);
+    }
+
+  for (i = n; i-- > 0; )
+    {
+      unsigned HOST_WIDE_INT s;
+
+      s = strides[i] < 0 ? -strides[i] : strides[i];
+
+      if (s < (unsigned) L1_CACHE_LINE_SIZE
+	  && (loop_sizes[i]
+	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
+	{
+	  ret = loop_sizes[i];
+	  break;
+	}
+    }
+
+  free (strides);
+  return ret;
+}
+
+/* Determines the distance till the first reuse of each reference in REFS
+   in the loop nest of LOOP.  */
+
+static void
+determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs)
+{
+  struct loop *nest, *aloop;
+  VEC (data_reference_p, heap) *datarefs = NULL;
+  VEC (ddr_p, heap) *dependences = NULL;
+  struct mem_ref_group *gr;
+  struct mem_ref *ref;
+  VEC (loop_p, heap) *vloops = NULL;
+  unsigned *loop_data_size;
+  unsigned i, j, n;
+  unsigned volume, dist, adist;
+  HOST_WIDE_INT vol;
+  data_reference_p dr;
+  ddr_p dep;
+
+  if (loop->inner)
+    return;
+
+  /* Find the outermost loop of the loop nest of loop (we require that
+     there are no sibling loops inside the nest).  */
+  nest = loop;
+  while (1)
+    {
+      aloop = loop_outer (nest);
+
+      if (aloop == current_loops->tree_root
+	  || aloop->inner->next)
+	break;
+
+      nest = aloop;
+    }
+
+  /* For each loop, determine the amount of data accessed in each iteration.
+     We use this to estimate whether the reference is evicted from the
+     cache before its reuse.  */
+  find_loop_nest (nest, &vloops);
+  n = VEC_length (loop_p, vloops);
+  loop_data_size = XNEWVEC (unsigned, n);
+  volume = volume_of_references (refs);
+  i = n;
+  while (i-- != 0)
+    {
+      loop_data_size[i] = volume;
+      /* Bound the volume by the L2 cache size, since above this bound,
+	 all dependence distances are equivalent.  */
+      if (volume > L2_CACHE_SIZE_BYTES)
+	continue;
+
+      aloop = VEC_index (loop_p, vloops, i);
+      vol = estimated_loop_iterations_int (aloop, false);
+      if (vol < 0)
+	vol = expected_loop_iterations (aloop);
+      volume *= vol;
+    }
+
+  /* Prepare the references in the form suitable for data dependence
+     analysis.  We ignore unanalysable data references (the results
+     are used just as a heuristics to estimate temporality of the
+     references, hence we do not need to worry about correctness).  */
+  for (gr = refs; gr; gr = gr->next)
+    for (ref = gr->refs; ref; ref = ref->next)
+      {
+	dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
+
+	if (dr)
+	  {
+	    ref->reuse_distance = volume;
+	    dr->aux = ref;
+	    VEC_safe_push (data_reference_p, heap, datarefs, dr);
+	  }
+      }
+
+  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
+    {
+      dist = self_reuse_distance (dr, loop_data_size, n, loop);
+      ref = dr->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+    }
+
+  compute_all_dependences (datarefs, &dependences, vloops, true);
+
+  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
+    {
+      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
+	continue;
+
+      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
+	  || DDR_NUM_DIST_VECTS (dep) == 0)
+	{
+	  /* If the dependence cannot be analysed, assume that there might be
+	     a reuse.  */
+	  dist = 0;
+	}
+      else
+	{
+	  /* The distance vectors are normalised to be always lexicographically
+	     positive, hence we cannot tell just from them whether DDR_A comes
+	     before DDR_B or vice versa.  However, it is not important,
+	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
+	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
+	     in cache (and marking it as nontemporal would not affect
+	     anything).  */
+
+	  dist = volume;
+	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
+	    {
+	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
+					     loop_data_size, n);
+
+	      /* Ignore accesses closer than
+		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+		 so that we use nontemporal prefetches e.g. if single memory
+		 location is accessed several times in a single iteration of
+		 the loop.  */
+	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
+		continue;
+
+	      if (adist < dist)
+		dist = adist;
+	    }
+	}
+
+      ref = DDR_A (dep)->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+      ref = DDR_B (dep)->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+    }
+
+  free_dependence_relations (dependences);
+  free_data_refs (datarefs);
+  free (loop_data_size);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Reuse distances:\n");
+      for (gr = refs; gr; gr = gr->next)
+	for (ref = gr->refs; ref; ref = ref->next)
+	  fprintf (dump_file, " ref %p distance %u\n",
+		   (void *) ref, ref->reuse_distance);
+    }
+}
+
 /* Issue prefetch instructions for array references in LOOP.  Returns
    true if the LOOP was unrolled.  */
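The reason self_reuse_distance exists is visible in the test's last loop: dependence analysis sees the a[j][i] accesses as independent, yet consecutive i iterations re-touch the same cache lines. A back-of-envelope check under the same assumed geometry as the earlier sketch (64-byte lines, L2 assumed 512 KB; all numbers illustrative):

/* For a[j][i] with N == 1000 doubles per row: the j-loop stride is one row
   (8000 bytes), the i-loop stride is sizeof (double).  Only the i stride
   fits within a cache line, so the self-reuse distance is one i iteration's
   data volume.  */
#include <stdio.h>

int main (void)
{
  unsigned line = 64, l2 = 8 * 64 * 1024;  /* assumed cache geometry */
  unsigned n = 1000;                       /* inner (j) iterations */
  unsigned per_j = line;                   /* a new line touched each j step */
  unsigned per_i = per_j * n;              /* 64000 bytes per i iteration */

  printf ("self-reuse after %u bytes -> %s\n", per_i,
          per_i >= l2 ? "nontemporal" : "temporal");
  return 0;
}

Under these assumptions the distance stays well below L2, so the loop keeps temporal prefetches, matching the comment in the test file.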
@@ -963,6 +1291,8 @@ loop_prefetch_arrays (struct loop *loop)
   if (!anything_to_prefetch_p (refs))
     goto fail;
 
+  determine_loop_nest_reuse (loop, refs);
+
   /* Step 3: determine the ahead and unroll factor.  */
 
   /* FIXME: the time should be weighted by the probabilities of the blocks in
@@ -1034,10 +1364,11 @@ tree_ssa_prefetch_arrays (void)
       fprintf (dump_file, "    simultaneous prefetches: %d\n",
 	       SIMULTANEOUS_PREFETCHES);
       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
-      fprintf (dump_file, "    L1 cache size: %d (%d bytes)\n",
-	       L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
-      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
+      fprintf (dump_file, "    L1 cache size: %d lines, %d bytes\n",
+	       L1_CACHE_SIZE, L1_CACHE_SIZE_BYTES);
+      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
+      fprintf (dump_file, "    L2 cache size: %d bytes\n", L2_CACHE_SIZE_BYTES);
       fprintf (dump_file, "\n");
     }
gcc/tree-vect-analyze.c
@@ -1128,7 +1128,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
     fprintf (vect_dump, "vect_compute_data_ref_alignment:");
 
   /* Initialize misalignment to unknown.  */
-  DR_MISALIGNMENT (dr) = -1;
+  SET_DR_MISALIGNMENT (dr, -1);
 
   misalign = DR_INIT (dr);
   aligned_to = DR_ALIGNED_TO (dr);
@@ -1198,7 +1198,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
       return false;
     }
 
-  DR_MISALIGNMENT (dr) = TREE_INT_CST_LOW (misalign);
+  SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
 
   if (vect_print_dump_info (REPORT_DETAILS))
     {
@@ -1267,21 +1267,23 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
 	continue;
       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 		  DR_MISALIGNMENT (dr_peel) / dr_peel_size);
-      DR_MISALIGNMENT (dr) = 0;
+      SET_DR_MISALIGNMENT (dr, 0);
       return;
     }
 
   if (known_alignment_for_access_p (dr)
      && known_alignment_for_access_p (dr_peel))
    {
-      DR_MISALIGNMENT (dr) += npeel * dr_size;
-      DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD;
+      int misal = DR_MISALIGNMENT (dr);
+      misal += npeel * dr_size;
+      misal %= UNITS_PER_SIMD_WORD;
+      SET_DR_MISALIGNMENT (dr, misal);
      return;
    }
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "Setting misalignment to -1.");
-  DR_MISALIGNMENT (dr) = -1;
+  SET_DR_MISALIGNMENT (dr, -1);
 }
 
 
@@ -1577,7 +1579,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 	  save_misalignment = DR_MISALIGNMENT (dr);
 	  vect_update_misalignment_for_peel (dr, dr0, npeel);
 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr);
-	  DR_MISALIGNMENT (dr) = save_misalignment;
+	  SET_DR_MISALIGNMENT (dr, save_misalignment);
 
 	  if (!supportable_dr_alignment)
 	    {
@@ -1601,7 +1603,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
       LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
       LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
-      DR_MISALIGNMENT (dr0) = 0;
+      SET_DR_MISALIGNMENT (dr0, 0);
       if (vect_print_dump_info (REPORT_ALIGNMENT))
 	fprintf (vect_dump, "Alignment of access forced using peeling.");
 
@@ -1702,7 +1704,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 	{
 	  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 	  dr = STMT_VINFO_DATA_REF (stmt_info);
-	  DR_MISALIGNMENT (dr) = 0;
+	  SET_DR_MISALIGNMENT (dr, 0);
 	  if (vect_print_dump_info (REPORT_ALIGNMENT))
 	    fprintf (vect_dump, "Alignment of access forced using versioning.");
 	}
gcc/tree-vectorizer.h
@@ -339,7 +339,8 @@ is_pattern_stmt_p (stmt_vec_info stmt_info)
 
 /* Reflects actual alignment of first access in the vectorized loop,
    taking into account peeling/versioning if applied.  */
-#define DR_MISALIGNMENT(DR)   (DR)->aux
+#define DR_MISALIGNMENT(DR)   ((int) (size_t) (DR)->aux)
+#define SET_DR_MISALIGNMENT(DR, VAL)   ((DR)->aux = (void *) (size_t) (VAL))
 
 static inline bool
 aligned_access_p (struct data_reference *data_ref_info)