tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve comments.

* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
	comments.
	(issue_prefetch_ref): Move assignment to write_p out of loop.
	(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
	SIMULTANEOUS_PREFETCHES into account.
	(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
	* lambda-code.c (lcm): Renamed to ...
	(least_common_multiple): ... and exported.
	* tree-flow.h (least_common_multiple): Declare.

From-SVN: r118730
This commit is contained in:
Zdenek Dvorak 2006-11-12 21:05:49 +01:00 committed by Zdenek Dvorak
parent 946e1bc757
commit 911b3fdbe3
6 changed files with 94 additions and 56 deletions

View File

@ -1,3 +1,15 @@
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* tree-ssa-loop-prefetch.c (schedule_prefetches): Cleanup and improve
comments.
(issue_prefetch_ref): Move assignment to write_p out of loop.
(determine_unroll_factor): Do not take PARAM_MAX_UNROLL_TIMES and
SIMULTANEOUS_PREFETCHES into account.
(loop_prefetch_arrays): Do not pass ahead to determine_unroll_factor.
* lambda-code.c (lcm): Renamed to ...
(least_common_multiple): ... and exported.
* tree-flow.h (least_common_multiple): Declare.
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* Makefile.in (tree-data-ref.o): Add langhooks.h dependency.

View File

@ -442,8 +442,8 @@ lambda_lattice_compute_base (lambda_loopnest nest)
/* Compute the least common multiple of two numbers A and B . */
static int
lcm (int a, int b)
int
least_common_multiple (int a, int b)
{
return (abs (a) * abs (b) / gcd (a, b));
}
@ -577,7 +577,7 @@ compute_nest_using_fourier_motzkin (int size,
{
if (A[k][i] < 0)
{
multiple = lcm (A[j][i], A[k][i]);
multiple = least_common_multiple (A[j][i], A[k][i]);
f1 = multiple / A[j][i];
f2 = -1 * multiple / A[k][i];

View File

@ -1,3 +1,7 @@
2006-11-12 Zdenek Dvorak <dvorakz@suse.cz>
* gcc.dg/tree-ssa/prefetch-3.c: New test.
2006-11-12 Roger Sayle <roger@eyesopen.com>
PR tree-optimization/13827

View File

@ -0,0 +1,22 @@
/* Prefetching used to prefer nonsensical unroll factor of 5 in this testcase. */
/* { dg-do compile { target i?86-*-* } } */
/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse -fdump-tree-aprefetch-details" } */
#define N 1000000
double a[N];
double test(void)
{
unsigned i;
double sum = 0;
for (i = 0; i < N; i += 2)
sum += (a[i] * a[i+1]);
return sum;
}
/* { dg-final { scan-tree-dump-times "unroll factor 4" 1 "aprefetch" } } */
/* { dg-final { cleanup-tree-dump "aprefetch" } } */

View File

@ -1036,4 +1036,7 @@ void swap_tree_operands (tree, tree *, tree *);
extern void recalculate_used_alone (void);
extern bool updating_used_alone;
int least_common_multiple (int, int);
#endif /* _TREE_FLOW_H */

View File

@ -744,19 +744,21 @@ static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
unsigned ahead)
{
unsigned max_prefetches, n_prefetches;
unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
unsigned slots_per_prefetch;
struct mem_ref *ref;
bool any = false;
max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
max_prefetches = SIMULTANEOUS_PREFETCHES;
/* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
/* The prefetch will run for AHEAD iterations of the original loop, i.e.,
AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
it will need a prefetch slot. */
slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);
if (!max_prefetches)
return false;
fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
slots_per_prefetch);
/* For now we just take memory references one by one and issue
prefetches for as many as possible. The groups are sorted
@ -769,16 +771,24 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
if (!should_issue_prefetch_p (ref))
continue;
ref->issue_prefetch_p = true;
/* If prefetch_mod is less then unroll_factor, we need to insert
several prefetches for the reference. */
/* If we need to prefetch the reference each PREFETCH_MOD iterations,
and we unroll the loop UNROLL_FACTOR times, we need to insert
ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
iteration. */
n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
/ ref->prefetch_mod);
if (max_prefetches <= n_prefetches)
return true;
prefetch_slots = n_prefetches * slots_per_prefetch;
max_prefetches -= n_prefetches;
/* If more than half of the prefetches would be lost anyway, do not
issue the prefetch. */
if (2 * remaining_prefetch_slots < prefetch_slots)
continue;
ref->issue_prefetch_p = true;
if (remaining_prefetch_slots <= prefetch_slots)
return true;
remaining_prefetch_slots -= prefetch_slots;
any = true;
}
@ -822,6 +832,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
/ ref->prefetch_mod);
addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
write_p = ref->write_p ? integer_one_node : integer_zero_node;
for (ap = 0; ap < n_prefetches; ap++)
{
@ -832,10 +843,9 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
/* Create the prefetch instruction. */
write_p = ref->write_p ? integer_one_node : integer_zero_node;
params = tree_cons (NULL_TREE, addr,
tree_cons (NULL_TREE, write_p, NULL_TREE));
prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
params);
bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
@ -888,48 +898,36 @@ should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
unsigned ahead, unsigned ninsns,
struct tree_niter_desc *desc)
unsigned ninsns, struct tree_niter_desc *desc)
{
unsigned upper_bound, size_factor, constraint_factor;
unsigned factor, max_mod_constraint, ahead_factor;
unsigned upper_bound;
unsigned nfactor, factor, mod_constraint;
struct mem_ref_group *agp;
struct mem_ref *ref;
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
/* First check whether the loop is not too large to unroll. */
size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
if (size_factor <= 1)
/* First check whether the loop is not too large to unroll. We ignore
PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
from unrolling them enough to make exactly one cache line covered by each
iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
us from unrolling the loops too many times in cases where we only expect
gains from better scheduling and decreasing loop overhead, which is not
the case here. */
upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
if (upper_bound <= 1)
return 1;
if (size_factor < upper_bound)
upper_bound = size_factor;
max_mod_constraint = 1;
/* Choose the factor so that we may prefetch each cache just once,
but bound the unrolling by UPPER_BOUND. */
factor = 1;
for (agp = refs; agp; agp = agp->next)
for (ref = agp->refs; ref; ref = ref->next)
if (should_issue_prefetch_p (ref)
&& ref->prefetch_mod > max_mod_constraint)
max_mod_constraint = ref->prefetch_mod;
/* Set constraint_factor as large as needed to be able to satisfy the
largest modulo constraint. */
constraint_factor = max_mod_constraint;
/* If ahead is too large in comparison with the number of available
prefetches, unroll the loop as much as needed to be able to prefetch
at least partially some of the references in the loop. */
ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
/ SIMULTANEOUS_PREFETCHES);
/* Unroll as much as useful, but bound the code size growth. */
if (constraint_factor < ahead_factor)
factor = ahead_factor;
else
factor = constraint_factor;
if (factor > upper_bound)
factor = upper_bound;
if (should_issue_prefetch_p (ref))
{
mod_constraint = ref->prefetch_mod;
nfactor = least_common_multiple (mod_constraint, factor);
if (nfactor <= upper_bound)
factor = nfactor;
}
if (!should_unroll_loop_p (loop, desc, factor))
return 1;
@ -964,8 +962,7 @@ loop_prefetch_arrays (struct loops *loops, struct loop *loop)
instructions executed per iteration of the loop. */
ninsns = tree_num_loop_insns (loop);
ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
&desc);
unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);