2009-06-08 Ghassan Shobaki <ghassan.shobaki@amd.com>

Dwarakanath Rajagopal  <dwarak.rajagopal@amd.com>
        
        * tree-ssa-loop-prefetch.c 
        (gather_memory_references): Introduced a counter for the number of 
        memory references.
        (anything_to_prefetch_p): Introduced a counter for the number of 
        prefetches.
        (is_loop_prefetching_profitable): New function with a cost model 
        for prefetching.
        (loop_prefetch_arrays): Use the new cost model to determine if 
        prefetching is profitable.
        * params.def (MIN_INSN_TO_PREFETCH_RATIO, 
        PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.
        * params.h (MIN_INSN_TO_PREFETCH_RATIO, 
        PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.
        * doc/invoke.texi (MIN_INSN_TO_PREFETCH_RATIO, 
        PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.


Co-Authored-By: Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>

From-SVN: r148277
This commit is contained in:
Ghassan Shobaki 2009-06-08 16:00:13 +00:00 committed by Dwarakanath Rajagopal
parent b01630bb3d
commit db34470d17
5 changed files with 160 additions and 26 deletions

View File

@ -1,3 +1,22 @@
2009-06-08 Ghassan Shobaki <ghassan.shobaki@amd.com>
Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
* tree-ssa-loop-prefetch.c
(gather_memory_references): Introduced a counter for the number of
memory references.
(anything_to_prefetch_p): Introduced a counter for the number of
prefetches.
(is_loop_prefetching_profitable): New function with a cost model
for prefetching.
(loop_prefetch_arrays): Use the new cost model to determine if
prefetching is profitable.
* params.def (MIN_INSN_TO_PREFETCH_RATIO,
PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.
* params.h (MIN_INSN_TO_PREFETCH_RATIO,
PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.
* doc/invoke.texi (MIN_INSN_TO_PREFETCH_RATIO,
PREFETCH_MIN_INSN_TO_MEM_RATIO): New parameters.
2009-06-08 Michael Matz <matz@suse.de>
PR debug/40012

View File

@ -7914,6 +7914,15 @@ The size of L1 cache, in kilobytes.
@item l2-cache-size
The size of L2 cache, in kilobytes.
@item min-insn-to-prefetch-ratio
The minimum ratio between the number of instructions and the
number of prefetches to enable prefetching in a loop with an
unknown trip count.
@item prefetch-min-insn-to-mem-ratio
The minimum ratio between the number of instructions and the
number of memory references to enable prefetching in a loop.
@item use-canonical-types
Whether the compiler should use the ``canonical'' type system. By
default, this should always be 1, which uses a more efficient internal

View File

@ -741,6 +741,17 @@ DEFPARAM (PARAM_SLP_MAX_INSNS_IN_BB,
"Maximum number of instructions in basic block to be considered for SLP vectorization",
1000, 0, 0)
DEFPARAM (PARAM_MIN_INSN_TO_PREFETCH_RATIO,
"min-insn-to-prefetch-ratio",
"min. ratio of insns to prefetches to enable prefetching for "
"a loop with an unknown trip count",
10, 0, 0)
DEFPARAM (PARAM_PREFETCH_MIN_INSN_TO_MEM_RATIO,
"prefetch-min-insn-to-mem-ratio",
"min. ratio of insns to mem ops to enable prefetching in a loop",
3, 0, 0)
/*
Local variables:
mode:c

View File

@ -166,4 +166,8 @@ typedef enum compiler_param
PARAM_VALUE (PARAM_LOOP_INVARIANT_MAX_BBS_IN_LOOP)
#define SLP_MAX_INSNS_IN_BB \
PARAM_VALUE (PARAM_SLP_MAX_INSNS_IN_BB)
#define MIN_INSN_TO_PREFETCH_RATIO \
PARAM_VALUE (PARAM_MIN_INSN_TO_PREFETCH_RATIO)
#define PREFETCH_MIN_INSN_TO_MEM_RATIO \
PARAM_VALUE (PARAM_PREFETCH_MIN_INSN_TO_MEM_RATIO)
#endif /* ! GCC_PARAMS_H */

View File

@ -109,6 +109,23 @@ along with GCC; see the file COPYING3. If not see
prefetch instructions with guards in cases where 5) was not sufficient
to satisfy the constraints?
The function is_loop_prefetching_profitable() implements a cost model
to determine if prefetching is profitable for a given loop. The cost
model has two heuristcs:
1. A heuristic that determines whether the given loop has enough CPU
ops that can be overlapped with cache missing memory ops.
If not, the loop won't benefit from prefetching. This is implemented
by requirung the ratio between the instruction count and the mem ref
count to be above a certain minimum.
2. A heuristic that disables prefetching in a loop with an unknown trip
count if the prefetching cost is above a certain limit. The relative
prefetching cost is estimated by taking the ratio between the
prefetch count and the total intruction count (this models the I-cache
cost).
The limits used in these heuristics are defined as parameters with
reasonable default values. Machine-specific default values will be
added later.
Some other TODO:
-- write and use more general reuse analysis (that could be also used
in other cache aimed loop optimizations)
@ -476,7 +493,7 @@ gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
true if there are no other memory references inside the loop. */
static struct mem_ref_group *
gather_memory_references (struct loop *loop, bool *no_other_refs)
gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
{
basic_block *body = get_loop_body_in_dom_order (loop);
basic_block bb;
@ -487,6 +504,7 @@ gather_memory_references (struct loop *loop, bool *no_other_refs)
struct mem_ref_group *refs = NULL;
*no_other_refs = true;
*ref_count = 0;
/* Scan the loop body in order, so that the former references precede the
later ones. */
@ -513,11 +531,17 @@ gather_memory_references (struct loop *loop, bool *no_other_refs)
rhs = gimple_assign_rhs1 (stmt);
if (REFERENCE_CLASS_P (rhs))
{
*no_other_refs &= gather_memory_references_ref (loop, &refs,
rhs, false, stmt);
*ref_count += 1;
}
if (REFERENCE_CLASS_P (lhs))
{
*no_other_refs &= gather_memory_references_ref (loop, &refs,
lhs, true, stmt);
*ref_count += 1;
}
}
}
free (body);
@ -846,20 +870,20 @@ schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
return any;
}
/* Determine whether there is any reference suitable for prefetching
in GROUPS. */
/* Estimate the number of prefetches in the given GROUPS. */
static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
static int
estimate_prefetch_count (struct mem_ref_group *groups)
{
struct mem_ref *ref;
int prefetch_count = 0;
for (; groups; groups = groups->next)
for (ref = groups->refs; ref; ref = ref->next)
if (should_issue_prefetch_p (ref))
return true;
prefetch_count++;
return false;
return prefetch_count;
}
/* Issue prefetches for the reference REF into loop as decided before.
@ -1449,6 +1473,71 @@ determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
}
}
/* Do a cost-benefit analysis to determine if prefetching is profitable
for the current loop given the following parameters:
AHEAD: the iteration ahead distance,
EST_NITER: the estimated trip count,
NINSNS: estimated number of instructions in the loop,
PREFETCH_COUNT: an estimate of the number of prefetches
MEM_REF_COUNT: total number of memory references in the loop. */
static bool
is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
unsigned ninsns, unsigned prefetch_count,
unsigned mem_ref_count)
{
int insn_to_mem_ratio, insn_to_prefetch_ratio;
if (mem_ref_count == 0)
return false;
/* Prefetching improves performance by overlapping cache missing
memory accesses with CPU operations. If the loop does not have
enough CPU operations to overlap with memory operations, prefetching
won't give a significant benefit. One approximate way of checking
this is to require the ratio of instructions to memory references to
be above a certain limit. This approximation works well in practice.
TODO: Implement a more precise computation by estimating the time
for each CPU or memory op in the loop. Time estimates for memory ops
should account for cache misses. */
insn_to_mem_ratio = ninsns / mem_ref_count;
if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
return false;
/* Profitability of prefetching is highly dependent on the trip count.
For a given AHEAD distance, the first AHEAD iterations do not benefit
from prefetching, and the last AHEAD iterations execute useless
prefetches. So, if the trip count is not large enough relative to AHEAD,
prefetching may cause serious performance degradation. To avoid this
problem when the trip count is not known at compile time, we
conservatively skip loops with high prefetching costs. For now, only
the I-cache cost is considered. The relative I-cache cost is estimated
by taking the ratio between the number of prefetches and the total
number of instructions. Since we are using integer arithmetic, we
compute the reciprocal of this ratio.
TODO: Account for loop unrolling, which may reduce the costs of
shorter stride prefetches. Note that not accounting for loop
unrolling over-estimates the cost and hence gives more conservative
results. */
if (est_niter < 0)
{
insn_to_prefetch_ratio = ninsns / prefetch_count;
return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
}
if (est_niter <= (HOST_WIDE_INT) ahead)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"Not prefetching -- loop estimated to roll only %d times\n",
(int) est_niter);
return false;
}
return true;
}
/* Issue prefetch instructions for array references in LOOP. Returns
true if the LOOP was unrolled. */
@ -1460,6 +1549,8 @@ loop_prefetch_arrays (struct loop *loop)
HOST_WIDE_INT est_niter;
struct tree_niter_desc desc;
bool unrolled = false, no_other_refs;
unsigned prefetch_count;
unsigned mem_ref_count;
if (optimize_loop_nest_for_size_p (loop))
{
@ -1469,12 +1560,13 @@ loop_prefetch_arrays (struct loop *loop)
}
/* Step 1: gather the memory references. */
refs = gather_memory_references (loop, &no_other_refs);
refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
/* Step 2: estimate the reuse effects. */
prune_by_reuse (refs);
if (!anything_to_prefetch_p (refs))
prefetch_count = estimate_prefetch_count (refs);
if (prefetch_count == 0)
goto fail;
determine_loop_nest_reuse (loop, refs, no_other_refs);
@ -1485,27 +1577,22 @@ loop_prefetch_arrays (struct loop *loop)
the loop body. */
time = tree_num_loop_insns (loop, &eni_time_weights);
ahead = (PREFETCH_LATENCY + time - 1) / time;
est_niter = estimated_loop_iterations_int (loop, false);
/* The prefetches will run for AHEAD iterations of the original loop. Unless
the loop rolls at least AHEAD times, prefetching the references does not
make sense. */
if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
{
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"Not prefetching -- loop estimated to roll only %d times\n",
(int) est_niter);
goto fail;
}
mark_nontemporal_stores (loop, refs);
est_niter = estimated_loop_iterations_int (loop, false);
ninsns = tree_num_loop_insns (loop, &eni_size_weights);
unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
est_niter);
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
fprintf (dump_file, "Ahead %d, unroll factor %d, trip count %ld\n"
"insn count %d, mem ref count %d, prefetch count %d\n",
ahead, unroll_factor, est_niter, ninsns, mem_ref_count,
prefetch_count);
if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns,
prefetch_count, mem_ref_count))
goto fail;
mark_nontemporal_stores (loop, refs);
/* Step 4: what to prefetch? */
if (!schedule_prefetches (refs, unroll_factor, ahead))
@ -1556,7 +1643,11 @@ tree_ssa_prefetch_arrays (void)
fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
MIN_INSN_TO_PREFETCH_RATIO);
fprintf (dump_file, " min insn-to-mem ratio: %d \n",
PREFETCH_MIN_INSN_TO_MEM_RATIO);
fprintf (dump_file, "\n");
}