tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.

* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
	(SET_DR_MISALIGNMENT): New.
	* tree-vect-analyze.c (vect_compute_data_ref_alignment,
	vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
	Use SET_DR_MISALIGNMENT.
	* tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
	pointer.
	* tree-data-ref.c (create_data_ref, compute_all_dependences,
	find_loop_nest): Export.
	* tree-data-ref.h (struct data_reference): Change aux field to pointer.
	(create_data_ref, compute_all_dependences, find_loop_nest): Declare.
	* tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
	(L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
	New macros.
	(struct mem_ref): Add field reuse_distance.
	(find_or_create_group, record_ref): Use XNEW instead of xcalloc.
	Initialize reuse_distance field.
	(issue_prefetch_ref): Select temporality of prefetch according to
	reuse_distance.
	(volume_of_references, volume_of_dist_vector, add_subscript_strides,
	self_reuse_distance, determine_loop_nest_reuse): New functions.
	(loop_prefetch_arrays): Call determine_loop_nest_reuse.
	(tree_ssa_prefetch_arrays): Dump L2 cache size.
	* Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
	dependency.

	* gcc.dg/tree-ssa/prefetch-6.c: New test.

From-SVN: r125172
Zdenek Dvorak 2007-05-29 21:55:47 +00:00
parent cd5ecab6a7
commit 5417e0224b
10 changed files with 453 additions and 30 deletions

gcc/ChangeLog

@@ -1,9 +1,37 @@
2007-05-29 Zdenek Dvorak <dvorakz@suse.cz>
* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
(SET_DR_MISALIGNMENT): New.
* tree-vect-analyze.c (vect_compute_data_ref_alignment,
vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
Use SET_DR_MISALIGNMENT.
* tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
pointer.
* tree-data-ref.c (create_data_ref, compute_all_dependences,
find_loop_nest): Export.
* tree-data-ref.h (struct data_reference): Change aux field to pointer.
(create_data_ref, compute_all_dependences, find_loop_nest): Declare.
* tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
(L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
New macros.
(struct mem_ref): Add field reuse_distance.
(find_or_create_group, record_ref): Use XNEW instead of xcalloc.
Initialize reuse_distance field.
(issue_prefetch_ref): Select temporality of prefetch according to
reuse_distance.
(volume_of_references, volume_of_dist_vector, add_subscript_strides,
self_reuse_distance, determine_loop_nest_reuse): New functions.
(loop_prefetch_arrays): Call determine_loop_nest_reuse.
(tree_ssa_prefetch_arrays): Dump L2 cache size.
* Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
dependency.
2007-05-29 Daniel Berlin <dberlin@dberlin.org>
* tree-ssa-alias.c: Add aliasing overview.
2007-05-29 Zuxy Meng <zuxy.meng@gmail.com>
Danny Smith <dannysmith@users.sourceforge.net>
PR target/29498
* config/i386/t-crtfm: Compile crtfastmath.o with

gcc/Makefile.in

@@ -2084,7 +2084,7 @@ tree-ssa-loop-prefetch.o: tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H) \
$(CFGLOOP_H) $(PARAMS_H) langhooks.h $(BASIC_BLOCK_H) hard-reg-set.h \
-tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H)
+tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H) $(TREE_DATA_REF_H)
tree-predcom.o: tree-predcom.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_P_H) \
$(CFGLOOP_H) $(TREE_FLOW_H) $(GGC_H) $(TREE_DATA_REF_H) $(SCEV_H) \
$(PARAMS_H) $(DIAGNOSTIC_H) tree-pass.h $(TM_H) coretypes.h tree-affine.h \

gcc/testsuite/ChangeLog

@@ -1,3 +1,7 @@
2007-05-29 Zdenek Dvorak <dvorakz@suse.cz>
* gcc.dg/tree-ssa/prefetch-6.c: New test.
2007-05-29 Tobias Schlüter <tobi@gcc.gnu.org>
* gfortran.dg/sizeof.f90: New.

gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c

@@ -0,0 +1,54 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-require-effective-target ilp32 } */
/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse --param simultaneous-prefetches=100 -fdump-tree-aprefetch-details" } */
#define N 1000
#define K 900
double a[N][N];
double test(void)
{
unsigned i, j;
double sum = 0;
/* Here, we should use non-temporal prefetch instruction. */
for (i = 0; i < K; i++)
for (j = 0; j < K; j++)
sum += a[i][j];
/* Here, we should not use non-temporal prefetch instruction, since the
value of a[i+10][j] is reused in L2 cache. */
for (i = 0; i < K; i++)
for (j = 0; j < K; j++)
sum += a[i][j] * a[i + 10][j];
/* Here, we should use non-temporal prefetch instruction, since the
value of a[i+100][j] is too far to be reused in L2 cache. */
for (i = 0; i < K; i++)
for (j = 0; j < K; j++)
sum += a[i][j] * a[i + 100][j];
/* Here, temporal prefetches should be used, since the volume of the
memory accesses is smaller than L2 cache. */
for (i = 0; i < 100; i++)
for (j = 0; j < 100; j++)
sum += a[i][j] * a[i + 100][j];
/* Temporal prefetches should be used here (even though the accesses to
a[j][i] are independent, the same cache line is almost always hit
every N iterations). */
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
sum += a[j][i];
return sum;
}
/* { dg-final { scan-tree-dump-times "Issued prefetch" 5 "aprefetch" } } */
/* { dg-final { scan-tree-dump-times "Issued nontemporal prefetch" 3 "aprefetch" } } */
/* { dg-final { scan-assembler-times "prefetcht" 5 } } */
/* { dg-final { scan-assembler-times "prefetchnta" 3 } } */
/* { dg-final { cleanup-tree-dump "aprefetch" } } */
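A rough check of the comments above (illustrative numbers only; the actual thresholds come from the target's L1 cache parameters): one row of a is N * sizeof (double) = 8000 bytes. Assuming a 32 kB L1 cache, the pass takes the L2 size to be 8 * 32 kB = 256 kB, so a[i+10][j] is reused after roughly 10 rows * 2 references * 8000 bytes = 160 kB of intervening accesses (below 256 kB, hence temporal), while a[i+100][j] is reused only after about 1.6 MB (above 256 kB, hence nontemporal).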

gcc/tree-data-ref.c

@@ -771,7 +771,7 @@ free_data_ref (data_reference_p dr)
data_reference description of MEMREF. NEST is the outermost loop of the
loop nest in that the reference should be analysed. */
-static struct data_reference *
+struct data_reference *
create_data_ref (struct loop *nest, tree memref, tree stmt, bool is_read)
{
struct data_reference *dr;
@@ -3843,7 +3843,7 @@ compute_self_dependence (struct data_dependence_relation *ddr)
COMPUTE_SELF_AND_RR is FALSE, don't compute read-read and self
relations. */
-static void
+void
compute_all_dependences (VEC (data_reference_p, heap) *datarefs,
VEC (ddr_p, heap) **dependence_relations,
VEC (loop_p, heap) *loop_nest,
@@ -4055,7 +4055,7 @@ find_loop_nest_1 (struct loop *loop, VEC (loop_p, heap) **loop_nest)
contain the loops from the outermost to the innermost, as they will
appear in the classic distance vector. */
-static bool
+bool
find_loop_nest (struct loop *loop, VEC (loop_p, heap) **loop_nest)
{
VEC_safe_push (loop_p, heap, *loop_nest, loop);

gcc/tree-data-ref.h

@@ -104,7 +104,7 @@ struct data_reference
tree ref;
/* Auxiliary info specific to a pass. */
-int aux;
+void *aux;
/* True when the data reference is in RHS of a stmt. */
bool is_read;
@@ -320,7 +320,10 @@ extern void dump_data_dependence_direction (FILE *,
extern void free_dependence_relation (struct data_dependence_relation *);
extern void free_dependence_relations (VEC (ddr_p, heap) *);
extern void free_data_refs (VEC (data_reference_p, heap) *);
struct data_reference *create_data_ref (struct loop *, tree, tree, bool);
bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
void compute_all_dependences (VEC (data_reference_p, heap) *,
VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool);
/* Return the index of the variable VAR in the LOOP_NEST array. */

gcc/tree-predcom.c

@@ -700,7 +700,7 @@ split_data_refs_to_components (struct loop *loop,
just fail. */
goto end;
}
-dr->aux = i;
+dr->aux = (void *) (size_t) i;
comp_father[i] = i;
comp_size[i] = 1;
}
@@ -715,7 +715,7 @@ split_data_refs_to_components (struct loop *loop,
if (!suitable_reference_p (dr, &dummy))
{
-ia = dr->aux;
+ia = (unsigned) (size_t) dr->aux;
merge_comps (comp_father, comp_size, n, ia);
}
}
@@ -729,8 +729,8 @@ split_data_refs_to_components (struct loop *loop,
dra = DDR_A (ddr);
drb = DDR_B (ddr);
-ia = component_of (comp_father, dra->aux);
-ib = component_of (comp_father, drb->aux);
+ia = component_of (comp_father, (unsigned) (size_t) dra->aux);
+ib = component_of (comp_father, (unsigned) (size_t) drb->aux);
if (ia == ib)
continue;
@@ -749,7 +749,7 @@ split_data_refs_to_components (struct loop *loop,
bad = component_of (comp_father, n);
for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
{
-ia = dr->aux;
+ia = (unsigned) (size_t) dr->aux;
ca = component_of (comp_father, ia);
if (ca == bad)
continue;

gcc/tree-ssa-loop-prefetch.c

@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#include "params.h"
#include "langhooks.h"
#include "tree-inline.h"
#include "tree-data-ref.h"
/* This pass inserts prefetch instructions to optimize cache usage during
accesses to arrays in loops. It processes loops sequentially and:
@@ -82,6 +83,10 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
7/32.
(5) has PREFETCH_MOD 1 as well.
Additionally, we use data dependence analysis to determine for each
reference the distance till the first reuse; this information is used
to determine the temporality of the issued prefetch instruction.
3) We determine how much ahead we need to prefetch. The number of
iterations needed is time to fetch / time spent in one iteration of
the loop. The problem is that we do not know either of these values,
@@ -161,6 +166,17 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
#define HAVE_prefetch 0
#endif
#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * L1_CACHE_LINE_SIZE))
/* TODO: Add parameter to specify L2 cache size. */
#define L2_CACHE_SIZE_BYTES (8 * L1_CACHE_SIZE_BYTES)
/* We consider a memory access nontemporal if it is not reused sooner than
after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
so that we use nontemporal prefetches e.g. if single memory location
is accessed several times in a single iteration of the loop. */
#define NONTEMPORAL_FRACTION 16
/* The group of references between that reuse may occur. */
struct mem_ref_group
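To put concrete numbers on the new macros (an illustrative configuration, not values taken from the patch): with an L1 cache of 512 lines of 64 bytes, L1_CACHE_SIZE_BYTES = 512 * 64 = 32768 bytes and L2_CACHE_SIZE_BYTES = 8 * 32768 = 262144 bytes. A prefetch is then issued as nontemporal only when the reuse distance reaches 256 kB, while reuses within 32 kB / 16 = 2 kB of intervening accesses are disregarded when computing that distance.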
@@ -190,6 +206,8 @@ struct mem_ref
unsigned HOST_WIDE_INT prefetch_before;
/* Prefetch only first PREFETCH_BEFORE
iterations. */
unsigned reuse_distance; /* The amount of data accessed before the first
reuse of this value. */
bool issue_prefetch_p; /* Should we really issue the prefetch? */
struct mem_ref *next; /* The next reference in the group. */
};
@@ -236,7 +254,7 @@ find_or_create_group (struct mem_ref_group **groups, tree base,
break;
}
-group = xcalloc (1, sizeof (struct mem_ref_group));
+group = XNEW (struct mem_ref_group);
group->base = base;
group->step = step;
group->refs = NULL;
@@ -273,13 +291,14 @@ record_ref (struct mem_ref_group *group, tree stmt, tree mem,
return;
}
-(*aref) = xcalloc (1, sizeof (struct mem_ref));
+(*aref) = XNEW (struct mem_ref);
(*aref)->stmt = stmt;
(*aref)->mem = mem;
(*aref)->delta = delta;
(*aref)->write_p = write_p;
(*aref)->prefetch_before = PREFETCH_ALL;
(*aref)->prefetch_mod = 1;
(*aref)->reuse_distance = 0;
(*aref)->issue_prefetch_p = false;
(*aref)->group = group;
(*aref)->next = NULL;
@@ -815,12 +834,15 @@ static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
HOST_WIDE_INT delta;
-tree addr, addr_base, prefetch, write_p;
+tree addr, addr_base, prefetch, write_p, local;
block_stmt_iterator bsi;
unsigned n_prefetches, ap;
bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
fprintf (dump_file, "Issued%s prefetch for %p.\n",
nontemporal ? " nontemporal" : "",
(void *) ref);
bsi = bsi_for_stmt (ref->stmt);
@@ -829,6 +851,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
write_p = ref->write_p ? integer_one_node : integer_zero_node;
local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
for (ap = 0; ap < n_prefetches; ap++)
{
@@ -840,7 +863,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/* Create the prefetch instruction. */
prefetch = build_call_expr (built_in_decls[BUILT_IN_PREFETCH],
-2, addr, write_p);
+3, addr, write_p, local);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
}
}
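The third call argument built above is the locality hint of GCC's prefetch builtin, so the calls the pass now emits correspond to the following user-level forms (a sketch; a, i and ahead are placeholder names, not taken from the patch):

  /* Reuse distance below L2_CACHE_SIZE_BYTES: keep the line in the
     cache hierarchy (locality hint 3; prefetcht0 on x86 with SSE).  */
  __builtin_prefetch (&a[i + ahead], 0, 3);

  /* Reuse distance at or above L2_CACHE_SIZE_BYTES: minimize cache
     pollution (locality hint 0; prefetchnta on x86 with SSE).  */
  __builtin_prefetch (&a[i + ahead], 0, 0);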
@@ -935,6 +958,311 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
return factor;
}
/* Returns the total volume of the memory references REFS, taking into account
reuses in the innermost loop and cache line size. TODO -- we should also
take into account reuses across the iterations of the loops in the loop
nest. */
static unsigned
volume_of_references (struct mem_ref_group *refs)
{
unsigned volume = 0;
struct mem_ref_group *gr;
struct mem_ref *ref;
for (gr = refs; gr; gr = gr->next)
for (ref = gr->refs; ref; ref = ref->next)
{
/* Almost always reuses another value? */
if (ref->prefetch_before != PREFETCH_ALL)
continue;
/* If several iterations access the same cache line, use the size of
the line divided by this number. Otherwise, a cache line is
accessed in each iteration. TODO -- in the latter case, we should
take the size of the reference into account, rounding it up on cache
line size multiple. */
volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
}
return volume;
}
/* Returns the volume of memory references accessed across VEC iterations of
loops, whose sizes are described in the LOOP_SIZES array. N is the number
of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
static unsigned
volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
{
unsigned i;
for (i = 0; i < n; i++)
if (vec[i] != 0)
break;
if (i == n)
return 0;
gcc_assert (vec[i] > 0);
/* We ignore the parts of the distance vector in subloops, since usually
the numbers of iterations are much smaller. */
return loop_sizes[i] * vec[i];
}
/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
at the position corresponding to the loop of the step. N is the depth
of the considered loop nest, and, LOOP is its innermost loop. */
static void
add_subscript_strides (tree access_fn, unsigned stride,
HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
{
struct loop *aloop;
tree step;
HOST_WIDE_INT astep;
unsigned min_depth = loop_depth (loop) - n;
while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
{
aloop = get_chrec_loop (access_fn);
step = CHREC_RIGHT (access_fn);
access_fn = CHREC_LEFT (access_fn);
if ((unsigned) loop_depth (aloop) <= min_depth)
continue;
if (host_integerp (step, 0))
astep = tree_low_cst (step, 0);
else
astep = L1_CACHE_LINE_SIZE;
strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
}
}
/* Returns the volume of memory references accessed between two consecutive
self-reuses of the reference DR. We consider the subscripts of DR in N
loops, and LOOP_SIZES contains the volumes of accesses in each of the
loops. LOOP is the innermost loop of the current loop nest. */
static unsigned
self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
struct loop *loop)
{
tree stride, access_fn;
HOST_WIDE_INT *strides, astride;
VEC (tree, heap) *access_fns;
tree ref = DR_REF (dr);
unsigned i, ret = ~0u;
/* In the following example:
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
use (a[j][i]);
the same cache line is accessed each N steps (except if the change from
i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
we cannot rely purely on the results of the data dependence analysis.
Instead, we compute the stride of the reference in each loop, and consider
the innermost loop in that the stride is less than cache size. */
strides = XCNEWVEC (HOST_WIDE_INT, n);
access_fns = DR_ACCESS_FNS (dr);
for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
{
/* Keep track of the reference corresponding to the subscript, so that we
know its stride. */
while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
ref = TREE_OPERAND (ref, 0);
if (TREE_CODE (ref) == ARRAY_REF)
{
stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
if (host_integerp (stride, 1))
astride = tree_low_cst (stride, 1);
else
astride = L1_CACHE_LINE_SIZE;
ref = TREE_OPERAND (ref, 0);
}
else
astride = 1;
add_subscript_strides (access_fn, astride, strides, n, loop);
}
for (i = n; i-- > 0; )
{
unsigned HOST_WIDE_INT s;
s = strides[i] < 0 ? -strides[i] : strides[i];
if (s < (unsigned) L1_CACHE_LINE_SIZE
&& (loop_sizes[i]
> (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
{
ret = loop_sizes[i];
break;
}
}
free (strides);
return ret;
}
/* Determines the distance till the first reuse of each reference in REFS
in the loop nest of LOOP. */
static void
determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs)
{
struct loop *nest, *aloop;
VEC (data_reference_p, heap) *datarefs = NULL;
VEC (ddr_p, heap) *dependences = NULL;
struct mem_ref_group *gr;
struct mem_ref *ref;
VEC (loop_p, heap) *vloops = NULL;
unsigned *loop_data_size;
unsigned i, j, n;
unsigned volume, dist, adist;
HOST_WIDE_INT vol;
data_reference_p dr;
ddr_p dep;
if (loop->inner)
return;
/* Find the outermost loop of the loop nest of loop (we require that
there are no sibling loops inside the nest). */
nest = loop;
while (1)
{
aloop = loop_outer (nest);
if (aloop == current_loops->tree_root
|| aloop->inner->next)
break;
nest = aloop;
}
/* For each loop, determine the amount of data accessed in each iteration.
We use this to estimate whether the reference is evicted from the
cache before its reuse. */
find_loop_nest (nest, &vloops);
n = VEC_length (loop_p, vloops);
loop_data_size = XNEWVEC (unsigned, n);
volume = volume_of_references (refs);
i = n;
while (i-- != 0)
{
loop_data_size[i] = volume;
/* Bound the volume by the L2 cache size, since above this bound,
all dependence distances are equivalent. */
if (volume > L2_CACHE_SIZE_BYTES)
continue;
aloop = VEC_index (loop_p, vloops, i);
vol = estimated_loop_iterations_int (aloop, false);
if (vol < 0)
vol = expected_loop_iterations (aloop);
volume *= vol;
}
/* Prepare the references in the form suitable for data dependence
analysis. We ignore unanalysable data references (the results
are used just as a heuristics to estimate temporality of the
references, hence we do not need to worry about correctness). */
for (gr = refs; gr; gr = gr->next)
for (ref = gr->refs; ref; ref = ref->next)
{
dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
if (dr)
{
ref->reuse_distance = volume;
dr->aux = ref;
VEC_safe_push (data_reference_p, heap, datarefs, dr);
}
}
for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
{
dist = self_reuse_distance (dr, loop_data_size, n, loop);
ref = dr->aux;
if (ref->reuse_distance > dist)
ref->reuse_distance = dist;
}
compute_all_dependences (datarefs, &dependences, vloops, true);
for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
{
if (DDR_ARE_DEPENDENT (dep) == chrec_known)
continue;
if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
|| DDR_NUM_DIST_VECTS (dep) == 0)
{
/* If the dependence cannot be analysed, assume that there might be
a reuse. */
dist = 0;
}
else
{
/* The distance vectors are normalised to be always lexicographically
positive, hence we cannot tell just from them whether DDR_A comes
before DDR_B or vice versa. However, it is not important,
anyway -- if DDR_A is close to DDR_B, then it is either reused in
DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
in cache (and marking it as nontemporal would not affect
anything). */
dist = volume;
for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
{
adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
loop_data_size, n);
/* Ignore accesses closer than
L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
so that we use nontemporal prefetches e.g. if single memory
location is accessed several times in a single iteration of
the loop. */
if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
continue;
if (adist < dist)
dist = adist;
}
}
ref = DDR_A (dep)->aux;
if (ref->reuse_distance > dist)
ref->reuse_distance = dist;
ref = DDR_B (dep)->aux;
if (ref->reuse_distance > dist)
ref->reuse_distance = dist;
}
free_dependence_relations (dependences);
free_data_refs (datarefs);
free (loop_data_size);
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Reuse distances:\n");
for (gr = refs; gr; gr = gr->next)
for (ref = gr->refs; ref; ref = ref->next)
fprintf (dump_file, " ref %p distance %u\n",
(void *) ref, ref->reuse_distance);
}
}
/* Issue prefetch instructions for array references in LOOP. Returns
true if the LOOP was unrolled. */
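To see how the new functions combine, consider a hypothetical single loop over an array of doubles, with the illustrative cache sizes used earlier:

  for (i = 0; i < n; i++)
    sum += a[i] + a[i + 65536];

Both references advance 8 bytes per iteration, so each touches a new 64-byte cache line every 8 iterations and volume_of_references counts 64 / 8 = 8 bytes per reference per iteration, 16 bytes in total. The dependence between the two references has distance 65536 iterations, so volume_of_dist_vector estimates the reuse distance as 65536 * 16 = 1 MB, which is at least the assumed 256 kB L2 size: both references get nontemporal prefetches.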
@@ -963,6 +1291,8 @@ loop_prefetch_arrays (struct loop *loop)
if (!anything_to_prefetch_p (refs))
goto fail;
determine_loop_nest_reuse (loop, refs);
/* Step 3: determine the ahead and unroll factor. */
/* FIXME: the time should be weighted by the probabilities of the blocks in
@@ -1034,10 +1364,11 @@ tree_ssa_prefetch_arrays (void)
fprintf (dump_file, " simultaneous prefetches: %d\n",
SIMULTANEOUS_PREFETCHES);
fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
fprintf (dump_file, " L1 cache size: %d (%d bytes)\n",
L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
fprintf (dump_file, " L1 cache size: %d lines, %d bytes\n",
L1_CACHE_SIZE, L1_CACHE_SIZE_BYTES);
fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
fprintf (dump_file, " L2 cache size: %d bytes\n", L2_CACHE_SIZE_BYTES);
fprintf (dump_file, "\n");
}
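With the illustrative 512-line, 64-byte-line configuration used above, the reworked dump lines would render as:

  L1 cache size: 512 lines, 32768 bytes
  L1 cache line size: 64
  L2 cache size: 262144 bytes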

gcc/tree-vect-analyze.c

@@ -1128,7 +1128,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
fprintf (vect_dump, "vect_compute_data_ref_alignment:");
/* Initialize misalignment to unknown. */
-DR_MISALIGNMENT (dr) = -1;
+SET_DR_MISALIGNMENT (dr, -1);
misalign = DR_INIT (dr);
aligned_to = DR_ALIGNED_TO (dr);
@@ -1198,7 +1198,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
return false;
}
-DR_MISALIGNMENT (dr) = TREE_INT_CST_LOW (misalign);
+SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
if (vect_print_dump_info (REPORT_DETAILS))
{
@@ -1267,21 +1267,23 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
continue;
gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
DR_MISALIGNMENT (dr_peel) / dr_peel_size);
-DR_MISALIGNMENT (dr) = 0;
+SET_DR_MISALIGNMENT (dr, 0);
return;
}
if (known_alignment_for_access_p (dr)
&& known_alignment_for_access_p (dr_peel))
{
-DR_MISALIGNMENT (dr) += npeel * dr_size;
-DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD;
+int misal = DR_MISALIGNMENT (dr);
+misal += npeel * dr_size;
+misal %= UNITS_PER_SIMD_WORD;
+SET_DR_MISALIGNMENT (dr, misal);
return;
}
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "Setting misalignment to -1.");
-DR_MISALIGNMENT (dr) = -1;
+SET_DR_MISALIGNMENT (dr, -1);
}
@@ -1577,7 +1579,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
save_misalignment = DR_MISALIGNMENT (dr);
vect_update_misalignment_for_peel (dr, dr0, npeel);
supportable_dr_alignment = vect_supportable_dr_alignment (dr);
-DR_MISALIGNMENT (dr) = save_misalignment;
+SET_DR_MISALIGNMENT (dr, save_misalignment);
if (!supportable_dr_alignment)
{
@@ -1601,7 +1603,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
-DR_MISALIGNMENT (dr0) = 0;
+SET_DR_MISALIGNMENT (dr0, 0);
if (vect_print_dump_info (REPORT_ALIGNMENT))
fprintf (vect_dump, "Alignment of access forced using peeling.");
@@ -1702,7 +1704,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
dr = STMT_VINFO_DATA_REF (stmt_info);
-DR_MISALIGNMENT (dr) = 0;
+SET_DR_MISALIGNMENT (dr, 0);
if (vect_print_dump_info (REPORT_ALIGNMENT))
fprintf (vect_dump, "Alignment of access forced using versioning.");
}

gcc/tree-vectorizer.h

@@ -339,7 +339,8 @@ is_pattern_stmt_p (stmt_vec_info stmt_info)
/* Reflects actual alignment of first access in the vectorized loop,
taking into account peeling/versioning if applied. */
-#define DR_MISALIGNMENT(DR) (DR)->aux
+#define DR_MISALIGNMENT(DR) ((int) (size_t) (DR)->aux)
+#define SET_DR_MISALIGNMENT(DR, VAL) ((DR)->aux = (void *) (size_t) (VAL))
static inline bool
aligned_access_p (struct data_reference *data_ref_info)
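The DR_MISALIGNMENT/SET_DR_MISALIGNMENT pair stores a small integer in the pointer-sized aux field by round-tripping it through size_t. A minimal standalone sketch of the idiom (struct and function names are illustrative, not from GCC):

  #include <stddef.h>

  struct ref_like
  {
    void *aux;  /* pass-specific slot, like data_reference.aux */
  };

  static void
  set_misalign (struct ref_like *r, int val)
  {
    /* Widen through size_t so the integer fills a pointer cleanly.  */
    r->aux = (void *) (size_t) val;
  }

  static int
  get_misalign (struct ref_like *r)
  {
    /* Truncating back recovers the value, including the -1 used for
       "unknown", on the usual two's-complement targets.  */
    return (int) (size_t) r->aux;
  }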