diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 3dec250c4b4..9c7429514df 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,9 +1,37 @@
+2007-05-29  Zdenek Dvorak
+
+	* tree-vectorizer.h (DR_MISALIGNMENT): Cast aux to integer.
+	(SET_DR_MISALIGNMENT): New.
+	* tree-vect-analyze.c (vect_compute_data_ref_alignment,
+	vect_update_misalignment_for_peel, vect_enhance_data_refs_alignment):
+	Use SET_DR_MISALIGNMENT.
+	* tree-predcom.c (split_data_refs_to_components): Cast dr->aux from
+	pointer.
+	* tree-data-ref.c (create_data_ref, compute_all_dependences,
+	find_loop_nest): Export.
+	* tree-data-ref.h (struct data_reference): Change aux field to pointer.
+	(create_data_ref, compute_all_dependences, find_loop_nest): Declare.
+	* tree-ssa-loop-prefetch.c: Include tree-data-ref.h.
+	(L1_CACHE_SIZE_BYTES, L2_CACHE_SIZE_BYTES, NONTEMPORAL_FRACTION):
+	New macros.
+	(struct mem_ref): Add field reuse_distance.
+	(find_or_create_group, record_ref): Use XNEW instead of xcalloc.
+	Initialize reuse_distance field.
+	(issue_prefetch_ref): Select temporality of prefetch according to
+	reuse_distance.
+	(volume_of_references, volume_of_dist_vector, add_subscript_strides,
+	self_reuse_distance, determine_loop_nest_reuse): New functions.
+	(loop_prefetch_arrays): Call determine_loop_nest_reuse.
+	(tree_ssa_prefetch_arrays): Dump L2 cache size.
+	* Makefile.in (tree-ssa-loop-prefetch.o): Add TREE_DATA_REF_H
+	dependency.
+
 2007-05-29  Daniel Berlin
 
 	* tree-ssa-alias.c: Add aliasing overview.
 
 2007-05-29  Zuxy Meng
-	    Danny Smith
+	    Danny Smith
 
 	PR target/29498
 	* config/i386/t-crtfm: Compile crtfastmath.o with
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 8064b2f01fa..605a0bdff4d 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -2084,7 +2084,7 @@ tree-ssa-loop-prefetch.o: tree-ssa-loop-prefetch.c $(TREE_FLOW_H) $(CONFIG_H) \
    output.h $(DIAGNOSTIC_H) $(TIMEVAR_H) $(TM_H) coretypes.h $(TREE_DUMP_H) \
    tree-pass.h $(GGC_H) $(RECOG_H) insn-config.h $(HASHTAB_H) $(SCEV_H) \
    $(CFGLOOP_H) $(PARAMS_H) langhooks.h $(BASIC_BLOCK_H) hard-reg-set.h \
-   tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H)
+   tree-chrec.h toplev.h langhooks.h $(TREE_INLINE_H) $(TREE_DATA_REF_H)
 tree-predcom.o: tree-predcom.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) $(TM_P_H) \
    $(CFGLOOP_H) $(TREE_FLOW_H) $(GGC_H) $(TREE_DATA_REF_H) $(SCEV_H) \
    $(PARAMS_H) $(DIAGNOSTIC_H) tree-pass.h $(TM_H) coretypes.h tree-affine.h \
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 696a47840de..6340ca4036e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2007-05-29  Zdenek Dvorak
+
+	* gcc.dg/tree-ssa/prefetch-6.c: New test.
+
 2007-05-29  Tobias Schlüter
 
 	* gfortran.dg/sizeof.f90: New.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c
new file mode 100644
index 00000000000..44f65bffb20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/prefetch-6.c
@@ -0,0 +1,54 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target ilp32 } */
+/* { dg-options "-O2 -fprefetch-loop-arrays -march=athlon -msse2 -mfpmath=sse --param simultaneous-prefetches=100 -fdump-tree-aprefetch-details" } */
+
+#define N 1000
+#define K 900
+
+double a[N][N];
+
+double test(void)
+{
+  unsigned i, j;
+  double sum = 0;
+
+  /* Here, we should use a non-temporal prefetch instruction.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j];
+
+  /* Here, we should not use a non-temporal prefetch instruction, since the
+     value of a[i+10][j] is reused in the L2 cache.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j] * a[i + 10][j];
+
+  /* Here, we should use a non-temporal prefetch instruction, since the
+     value of a[i+100][j] is too far away to be reused in the L2 cache.  */
+  for (i = 0; i < K; i++)
+    for (j = 0; j < K; j++)
+      sum += a[i][j] * a[i + 100][j];
+
+  /* Here, temporal prefetches should be used, since the volume of the
+     memory accesses is smaller than the L2 cache.  */
+  for (i = 0; i < 100; i++)
+    for (j = 0; j < 100; j++)
+      sum += a[i][j] * a[i + 100][j];
+
+  /* Temporal prefetches should be used here as well (even though the
+     accesses to a[j][i] are independent, the same cache line is almost
+     always hit every N iterations).  */
+  for (i = 0; i < N; i++)
+    for (j = 0; j < N; j++)
+      sum += a[j][i];
+
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "Issued prefetch" 5 "aprefetch" } } */
+/* { dg-final { scan-tree-dump-times "Issued nontemporal prefetch" 3 "aprefetch" } } */
+
+/* { dg-final { scan-assembler-times "prefetcht" 5 } } */
+/* { dg-final { scan-assembler-times "prefetchnta" 3 } } */
+
+/* { dg-final { cleanup-tree-dump "aprefetch" } } */
diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
index ef3353fccb1..99f8fd2b91f 100644
--- a/gcc/tree-data-ref.c
+++ b/gcc/tree-data-ref.c
@@ -771,7 +771,7 @@ free_data_ref (data_reference_p dr)
    data_reference description of MEMREF.  NEST is the outermost loop of the
    loop nest in that the reference should be analysed.  */
 
-static struct data_reference *
+struct data_reference *
 create_data_ref (struct loop *nest, tree memref, tree stmt, bool is_read)
 {
   struct data_reference *dr;
@@ -3843,7 +3843,7 @@ compute_self_dependence (struct data_dependence_relation *ddr)
    COMPUTE_SELF_AND_RR is FALSE, don't compute read-read and self
    relations.  */
 
-static void
+void
 compute_all_dependences (VEC (data_reference_p, heap) *datarefs,
                          VEC (ddr_p, heap) **dependence_relations,
                          VEC (loop_p, heap) *loop_nest,
@@ -4055,7 +4055,7 @@ find_loop_nest_1 (struct loop *loop, VEC (loop_p, heap) **loop_nest)
    contain the loops from the outermost to the innermost, as they will
    appear in the classic distance vector.  */
 
-static bool
+bool
 find_loop_nest (struct loop *loop, VEC (loop_p, heap) **loop_nest)
 {
   VEC_safe_push (loop_p, heap, *loop_nest, loop);
diff --git a/gcc/tree-data-ref.h b/gcc/tree-data-ref.h
index b53c6f17bfd..ddff36a0ec0 100644
--- a/gcc/tree-data-ref.h
+++ b/gcc/tree-data-ref.h
@@ -104,7 +104,7 @@ struct data_reference
   tree ref;
 
   /* Auxiliary info specific to a pass.  */
-  int aux;
+  void *aux;
 
   /* True when the data reference is in RHS of a stmt.  */
   bool is_read;
@@ -320,7 +320,10 @@ extern void dump_data_dependence_direction (FILE *,
 extern void free_dependence_relation (struct data_dependence_relation *);
 extern void free_dependence_relations (VEC (ddr_p, heap) *);
 extern void free_data_refs (VEC (data_reference_p, heap) *);
-
+struct data_reference *create_data_ref (struct loop *, tree, tree, bool);
+bool find_loop_nest (struct loop *, VEC (loop_p, heap) **);
+void compute_all_dependences (VEC (data_reference_p, heap) *,
+                              VEC (ddr_p, heap) **, VEC (loop_p, heap) *, bool);
 
 /* Return the index of the variable VAR in the LOOP_NEST array.  */
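The newly exported routines let passes outside the dependence framework drive the analysis themselves. A minimal usage sketch, not part of the patch (`outer`, `some_mem`, and `some_stmt` are placeholder names; determine_loop_nest_reuse below is the actual in-tree consumer):

    VEC (loop_p, heap) *nest = NULL;
    VEC (data_reference_p, heap) *datarefs = NULL;
    VEC (ddr_p, heap) *dependences = NULL;
    struct data_reference *dr;

    /* Collect the loops of the nest rooted at OUTER, outermost first.  */
    find_loop_nest (outer, &nest);

    /* Build a data_reference for each interesting memory access.  */
    dr = create_data_ref (outer, some_mem, some_stmt, true /* is_read */);
    if (dr)
      VEC_safe_push (data_reference_p, heap, datarefs, dr);

    /* Compute dependence relations, including read-read and self ones.  */
    compute_all_dependences (datarefs, &dependences, nest, true);

    /* ... inspect DDR_DIST_VECT etc. ...  */
    free_dependence_relations (dependences);
    free_data_refs (datarefs);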
diff --git a/gcc/tree-predcom.c b/gcc/tree-predcom.c
index bae23ed5272..a3a137f7d28 100644
--- a/gcc/tree-predcom.c
+++ b/gcc/tree-predcom.c
@@ -700,7 +700,7 @@ split_data_refs_to_components (struct loop *loop,
 	     just fail.  */
 	  goto end;
 	}
-      dr->aux = i;
+      dr->aux = (void *) (size_t) i;
       comp_father[i] = i;
       comp_size[i] = 1;
     }
@@ -715,7 +715,7 @@ split_data_refs_to_components (struct loop *loop,
 
       if (!suitable_reference_p (dr, &dummy))
 	{
-	  ia = dr->aux;
+	  ia = (unsigned) (size_t) dr->aux;
 	  merge_comps (comp_father, comp_size, n, ia);
 	}
     }
@@ -729,8 +729,8 @@ split_data_refs_to_components (struct loop *loop,
       dra = DDR_A (ddr);
       drb = DDR_B (ddr);
-      ia = component_of (comp_father, dra->aux);
-      ib = component_of (comp_father, drb->aux);
+      ia = component_of (comp_father, (unsigned) (size_t) dra->aux);
+      ib = component_of (comp_father, (unsigned) (size_t) drb->aux);
       if (ia == ib)
 	continue;
@@ -749,7 +749,7 @@ split_data_refs_to_components (struct loop *loop,
   bad = component_of (comp_father, n);
   for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
     {
-      ia = dr->aux;
+      ia = (unsigned) (size_t) dr->aux;
       ca = component_of (comp_father, ia);
       if (ca == bad)
 	continue;
diff --git a/gcc/tree-ssa-loop-prefetch.c b/gcc/tree-ssa-loop-prefetch.c
index 35e80218cb2..3159748c672 100644
--- a/gcc/tree-ssa-loop-prefetch.c
+++ b/gcc/tree-ssa-loop-prefetch.c
@@ -46,6 +46,7 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #include "params.h"
 #include "langhooks.h"
 #include "tree-inline.h"
+#include "tree-data-ref.h"
 
 /* This pass inserts prefetch instructions to optimize cache usage during
    accesses to arrays in loops.  It processes loops sequentially and:
@@ -82,6 +83,10 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
       7/32.
       (5) has PREFETCH_MOD 1 as well.
 
+   Additionally, we use data dependence analysis to determine for each
+   reference the distance until its first reuse; this information is used
+   to determine the temporality of the issued prefetch instruction.
+
    3) We determine how much ahead we need to prefetch.  The number of
       iterations needed is time to fetch / time spent in one iteration of
       the loop.  The problem is that we do not know either of these values,
@@ -161,6 +166,17 @@ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
 #define HAVE_prefetch 0
 #endif
 
+#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * L1_CACHE_LINE_SIZE))
+/* TODO: Add parameter to specify L2 cache size.  */
+#define L2_CACHE_SIZE_BYTES (8 * L1_CACHE_SIZE_BYTES)
+
+/* We consider a memory access nontemporal if it is not reused sooner than
+   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
+   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+   so that we use nontemporal prefetches e.g. if a single memory location
+   is accessed several times in a single iteration of the loop.  */
+#define NONTEMPORAL_FRACTION 16
+
 /* The group of references between that reuse may occur.  */
 
 struct mem_ref_group
@@ -190,6 +206,8 @@ struct mem_ref
   unsigned HOST_WIDE_INT prefetch_before;
                                 /* Prefetch only first PREFETCH_BEFORE
                                    iterations.  */
+  unsigned reuse_distance;      /* The amount of data accessed before the
+                                   first reuse of this value.  */
   bool issue_prefetch_p;        /* Should we really issue the prefetch?  */
   struct mem_ref *next;         /* The next reference in the group.  */
 };
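To put concrete numbers on these macros: the real values come from the target's L1_CACHE_SIZE and L1_CACHE_LINE_SIZE parameters, so the figures below are only an assumed illustration.

    /* Assume a 64-byte cache line and an L1_CACHE_SIZE of 512 lines:

         L1_CACHE_SIZE_BYTES = 512 * 64  = 32768 bytes  (32 KB)
         L2_CACHE_SIZE_BYTES = 8 * 32768 = 262144 bytes (256 KB)

       A reference is then prefetched nontemporally when at least 262144
       bytes of other data are accessed between two of its uses, while
       reuses closer than 32768 / 16 = 2048 bytes are ignored.  */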
@@ -236,7 +254,7 @@ find_or_create_group (struct mem_ref_group **groups, tree base,
       break;
     }
 
-  group = xcalloc (1, sizeof (struct mem_ref_group));
+  group = XNEW (struct mem_ref_group);
   group->base = base;
   group->step = step;
   group->refs = NULL;
@@ -273,13 +291,14 @@ record_ref (struct mem_ref_group *group, tree stmt, tree mem,
       return;
     }
 
-  (*aref) = xcalloc (1, sizeof (struct mem_ref));
+  (*aref) = XNEW (struct mem_ref);
   (*aref)->stmt = stmt;
   (*aref)->mem = mem;
   (*aref)->delta = delta;
   (*aref)->write_p = write_p;
   (*aref)->prefetch_before = PREFETCH_ALL;
   (*aref)->prefetch_mod = 1;
+  (*aref)->reuse_distance = 0;
   (*aref)->issue_prefetch_p = false;
   (*aref)->group = group;
   (*aref)->next = NULL;
@@ -815,12 +834,15 @@ static void
 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 {
   HOST_WIDE_INT delta;
-  tree addr, addr_base, prefetch, write_p;
+  tree addr, addr_base, prefetch, write_p, local;
   block_stmt_iterator bsi;
   unsigned n_prefetches, ap;
+  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
-    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
+    fprintf (dump_file, "Issued%s prefetch for %p.\n",
+             nontemporal ? " nontemporal" : "",
+             (void *) ref);
 
   bsi = bsi_for_stmt (ref->stmt);
 
@@ -829,6 +851,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
   addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
   addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
   write_p = ref->write_p ? integer_one_node : integer_zero_node;
+  local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
 
   for (ap = 0; ap < n_prefetches; ap++)
     {
@@ -840,7 +863,7 @@ issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
 
       /* Create the prefetch instruction.  */
       prefetch = build_call_expr (built_in_decls[BUILT_IN_PREFETCH],
-                                  2, addr, write_p);
+                                  3, addr, write_p, local);
       bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
     }
 }
@@ -935,6 +958,311 @@ determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
   return factor;
 }
 
+/* Returns the total volume of the memory references REFS, taking into account
+   reuses in the innermost loop and cache line size.  TODO -- we should also
+   take into account reuses across the iterations of the loops in the loop
+   nest.  */
+
+static unsigned
+volume_of_references (struct mem_ref_group *refs)
+{
+  unsigned volume = 0;
+  struct mem_ref_group *gr;
+  struct mem_ref *ref;
+
+  for (gr = refs; gr; gr = gr->next)
+    for (ref = gr->refs; ref; ref = ref->next)
+      {
+	/* Almost always reuses another value?  */
+	if (ref->prefetch_before != PREFETCH_ALL)
+	  continue;
+
+	/* If several iterations access the same cache line, use the size of
+	   the line divided by this number.  Otherwise, a cache line is
+	   accessed in each iteration.  TODO -- in the latter case, we should
+	   take the size of the reference into account, rounding it up to a
+	   multiple of the cache line size.  */
+	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
+      }
+  return volume;
+}
+
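At the source level, the three-argument calls that issue_prefetch_ref now builds correspond to GCC's documented __builtin_prefetch (addr, rw, locality), whose last operand ranges from 0 (no temporal locality) to 3 (high). On SSE-capable x86 targets, locality 0 typically expands to prefetchnta and locality 3 to prefetcht0, which is what the new test scans for. The array and index names below are placeholders:

    /* Read that is not expected to survive in cache until its next use: */
    __builtin_prefetch (&a[i + ahead][j], 0, 0);   /* -> prefetchnta */
    /* Read that is expected to be reused before eviction: */
    __builtin_prefetch (&a[i + ahead][j], 0, 3);   /* -> prefetcht0 */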
+/* Returns the volume of memory references accessed across VEC iterations of
+   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
+   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
+
+static unsigned
+volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
+{
+  unsigned i;
+
+  for (i = 0; i < n; i++)
+    if (vec[i] != 0)
+      break;
+
+  if (i == n)
+    return 0;
+
+  gcc_assert (vec[i] > 0);
+
+  /* We ignore the parts of the distance vector in subloops, since usually
+     the numbers of iterations are much smaller.  */
+  return loop_sizes[i] * vec[i];
+}
+
+/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
+   at the position corresponding to the loop of the step.  N is the depth
+   of the considered loop nest, and LOOP is its innermost loop.  */
+
+static void
+add_subscript_strides (tree access_fn, unsigned stride,
+		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
+{
+  struct loop *aloop;
+  tree step;
+  HOST_WIDE_INT astep;
+  unsigned min_depth = loop_depth (loop) - n;
+
+  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
+    {
+      aloop = get_chrec_loop (access_fn);
+      step = CHREC_RIGHT (access_fn);
+      access_fn = CHREC_LEFT (access_fn);
+
+      if ((unsigned) loop_depth (aloop) <= min_depth)
+	continue;
+
+      if (host_integerp (step, 0))
+	astep = tree_low_cst (step, 0);
+      else
+	astep = L1_CACHE_LINE_SIZE;
+
+      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
+    }
+}
+
+/* Returns the volume of memory references accessed between two consecutive
+   self-reuses of the reference DR.  We consider the subscripts of DR in N
+   loops, and LOOP_SIZES contains the volumes of accesses in each of the
+   loops.  LOOP is the innermost loop of the current loop nest.  */
+
+static unsigned
+self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
+		     struct loop *loop)
+{
+  tree stride, access_fn;
+  HOST_WIDE_INT *strides, astride;
+  VEC (tree, heap) *access_fns;
+  tree ref = DR_REF (dr);
+  unsigned i, ret = ~0u;
+
+  /* In the following example:
+
+     for (i = 0; i < N; i++)
+       for (j = 0; j < N; j++)
+         use (a[j][i]);
+
+     the same cache line is accessed every N steps (except if the change from
+     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
+     we cannot rely purely on the results of the data dependence analysis.
+
+     Instead, we compute the stride of the reference in each loop, and consider
+     the innermost loop in which the stride is less than the cache line
+     size.  */
+
+  strides = XCNEWVEC (HOST_WIDE_INT, n);
+  access_fns = DR_ACCESS_FNS (dr);
+
+  for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
+    {
+      /* Keep track of the reference corresponding to the subscript, so that we
+	 know its stride.  */
+      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
+	ref = TREE_OPERAND (ref, 0);
+
+      if (TREE_CODE (ref) == ARRAY_REF)
+	{
+	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+	  if (host_integerp (stride, 1))
+	    astride = tree_low_cst (stride, 1);
+	  else
+	    astride = L1_CACHE_LINE_SIZE;
+
+	  ref = TREE_OPERAND (ref, 0);
+	}
+      else
+	astride = 1;
+
+      add_subscript_strides (access_fn, astride, strides, n, loop);
+    }
+
+  for (i = n; i-- > 0; )
+    {
+      unsigned HOST_WIDE_INT s;
+
+      s = strides[i] < 0 ? -strides[i] : strides[i];
+
+      if (s < (unsigned) L1_CACHE_LINE_SIZE
+	  && (loop_sizes[i]
+	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
+	{
+	  ret = loop_sizes[i];
+	  break;
+	}
+    }
+
+  free (strides);
+  return ret;
+}
+
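Worked through on the last loop of prefetch-6.c, assuming 8-byte doubles and a 64-byte cache line (illustrative numbers only):

    /* For sum += a[j][i] with inner loop j, outer loop i, N = 1000:
         stride in j (inner) = N * 8 = 8000 bytes  -- not below line size
         stride in i (outer) = 8 bytes             -- below line size
       The outer loop is thus the innermost loop with a sub-line stride,
       so the self-reuse distance is loop_sizes[outer], the data volume of
       a single outer-loop iteration; that stays well below the L2 bound,
       so the reference is kept temporal, as the test expects.  */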
+/* Determines the distance until the first reuse of each reference in REFS
+   in the loop nest of LOOP.  */
+
+static void
+determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs)
+{
+  struct loop *nest, *aloop;
+  VEC (data_reference_p, heap) *datarefs = NULL;
+  VEC (ddr_p, heap) *dependences = NULL;
+  struct mem_ref_group *gr;
+  struct mem_ref *ref;
+  VEC (loop_p, heap) *vloops = NULL;
+  unsigned *loop_data_size;
+  unsigned i, j, n;
+  unsigned volume, dist, adist;
+  HOST_WIDE_INT vol;
+  data_reference_p dr;
+  ddr_p dep;
+
+  if (loop->inner)
+    return;
+
+  /* Find the outermost loop of the loop nest of LOOP (we require that
+     there are no sibling loops inside the nest).  */
+  nest = loop;
+  while (1)
+    {
+      aloop = loop_outer (nest);
+
+      if (aloop == current_loops->tree_root
+	  || aloop->inner->next)
+	break;
+
+      nest = aloop;
+    }
+
+  /* For each loop, determine the amount of data accessed in each iteration.
+     We use this to estimate whether the reference is evicted from the
+     cache before its reuse.  */
+  find_loop_nest (nest, &vloops);
+  n = VEC_length (loop_p, vloops);
+  loop_data_size = XNEWVEC (unsigned, n);
+  volume = volume_of_references (refs);
+  i = n;
+  while (i-- != 0)
+    {
+      loop_data_size[i] = volume;
+      /* Bound the volume by the L2 cache size, since above this bound,
+	 all dependence distances are equivalent.  */
+      if (volume > L2_CACHE_SIZE_BYTES)
+	continue;
+
+      aloop = VEC_index (loop_p, vloops, i);
+      vol = estimated_loop_iterations_int (aloop, false);
+      if (vol < 0)
+	vol = expected_loop_iterations (aloop);
+      volume *= vol;
+    }
+
+  /* Prepare the references in the form suitable for data dependence
+     analysis.  We ignore unanalysable data references (the results
+     are used just as a heuristic to estimate the temporality of the
+     references, hence we do not need to worry about correctness).  */
+  for (gr = refs; gr; gr = gr->next)
+    for (ref = gr->refs; ref; ref = ref->next)
+      {
+	dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
+
+	if (dr)
+	  {
+	    ref->reuse_distance = volume;
+	    dr->aux = ref;
+	    VEC_safe_push (data_reference_p, heap, datarefs, dr);
+	  }
+      }
+
+  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
+    {
+      dist = self_reuse_distance (dr, loop_data_size, n, loop);
+      ref = dr->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+    }
+
+  compute_all_dependences (datarefs, &dependences, vloops, true);
+
+  for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
+    {
+      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
+	continue;
+
+      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
+	  || DDR_NUM_DIST_VECTS (dep) == 0)
+	{
+	  /* If the dependence cannot be analysed, assume that there might be
+	     a reuse.  */
+	  dist = 0;
+	}
+      else
+	{
+	  /* The distance vectors are normalised to be always lexicographically
+	     positive, hence we cannot tell just from them whether DDR_A comes
+	     before DDR_B or vice versa.  However, it is not important,
+	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
+	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
+	     in cache (and marking it as nontemporal would not affect
+	     anything).  */
+
+	  dist = volume;
+	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
+	    {
+	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
+					     loop_data_size, n);
+
+	      /* Ignore accesses closer than
+		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
+		 so that we use nontemporal prefetches e.g. if a single memory
+		 location is accessed several times in a single iteration of
+		 the loop.  */
+	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
+		continue;
+
+	      if (adist < dist)
+		dist = adist;
+	    }
+	}
+
+      ref = DDR_A (dep)->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+      ref = DDR_B (dep)->aux;
+      if (ref->reuse_distance > dist)
+	ref->reuse_distance = dist;
+    }
+
+  free_dependence_relations (dependences);
+  free_data_refs (datarefs);
+  free (loop_data_size);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Reuse distances:\n");
+      for (gr = refs; gr; gr = gr->next)
+	for (ref = gr->refs; ref; ref = ref->next)
+	  fprintf (dump_file, " ref %p distance %u\n",
+		   (void *) ref, ref->reuse_distance);
+    }
+}
+
 /* Issue prefetch instructions for array references in LOOP.  Returns true
    if the LOOP was unrolled.  */
 
@@ -963,6 +1291,8 @@ loop_prefetch_arrays (struct loop *loop)
   if (!anything_to_prefetch_p (refs))
     goto fail;
 
+  determine_loop_nest_reuse (loop, refs);
+
   /* Step 3: determine the ahead and unroll factor.  */
 
   /* FIXME: the time should be weighted by the probabilities of the blocks in
@@ -1034,10 +1364,11 @@ tree_ssa_prefetch_arrays (void)
       fprintf (dump_file, "    simultaneous prefetches: %d\n",
	       SIMULTANEOUS_PREFETCHES);
       fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
-      fprintf (dump_file, "    L1 cache size: %d (%d bytes)\n",
-	       L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
-      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
       fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
+      fprintf (dump_file, "    L1 cache size: %d lines, %d bytes\n",
+	       L1_CACHE_SIZE, L1_CACHE_SIZE_BYTES);
+      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
+      fprintf (dump_file, "    L2 cache size: %d bytes\n", L2_CACHE_SIZE_BYTES);
       fprintf (dump_file, "\n");
     }
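Tracing the arithmetic for the a[i+10][j] and a[i+100][j] loops of prefetch-6.c, under the same assumed parameters as above (64-byte lines, 262144-byte L2 bound):

    /* Both references step through doubles contiguously in j, so each has
       PREFETCH_MOD 8 and contributes 64 / 8 = 8 bytes per iteration:
         volume per inner iteration  =  2 * 8      =     16 bytes
         volume per outer iteration  =  16 * 900   =  14400 bytes
       The dependence distance vector between the two is (10, 0), hence
         volume_of_dist_vector       =  10 * 14400 = 144000 bytes,
       below 262144, so the prefetch stays temporal.  With distance
       (100, 0) the volume is 1440000 bytes, above the bound, and a
       nontemporal prefetch is issued instead -- matching the dg-final
       counts in the test.  */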
diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c
index 6636cfd2758..647d98f82a6 100644
--- a/gcc/tree-vect-analyze.c
+++ b/gcc/tree-vect-analyze.c
@@ -1128,7 +1128,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
     fprintf (vect_dump, "vect_compute_data_ref_alignment:");
 
   /* Initialize misalignment to unknown.  */
-  DR_MISALIGNMENT (dr) = -1;
+  SET_DR_MISALIGNMENT (dr, -1);
 
   misalign = DR_INIT (dr);
   aligned_to = DR_ALIGNED_TO (dr);
@@ -1198,7 +1198,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
       return false;
     }
 
-  DR_MISALIGNMENT (dr) = TREE_INT_CST_LOW (misalign);
+  SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
 
   if (vect_print_dump_info (REPORT_DETAILS))
     {
@@ -1267,21 +1267,23 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
	continue;
       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
		   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
-      DR_MISALIGNMENT (dr) = 0;
+      SET_DR_MISALIGNMENT (dr, 0);
       return;
     }
 
   if (known_alignment_for_access_p (dr)
       && known_alignment_for_access_p (dr_peel))
     {
-      DR_MISALIGNMENT (dr) += npeel * dr_size;
-      DR_MISALIGNMENT (dr) %= UNITS_PER_SIMD_WORD;
+      int misal = DR_MISALIGNMENT (dr);
+      misal += npeel * dr_size;
+      misal %= UNITS_PER_SIMD_WORD;
+      SET_DR_MISALIGNMENT (dr, misal);
       return;
     }
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "Setting misalignment to -1.");
-  DR_MISALIGNMENT (dr) = -1;
+  SET_DR_MISALIGNMENT (dr, -1);
 }
 
 
@@ -1577,7 +1579,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
	      save_misalignment = DR_MISALIGNMENT (dr);
	      vect_update_misalignment_for_peel (dr, dr0, npeel);
	      supportable_dr_alignment = vect_supportable_dr_alignment (dr);
-	      DR_MISALIGNMENT (dr) = save_misalignment;
+	      SET_DR_MISALIGNMENT (dr, save_misalignment);
 
	      if (!supportable_dr_alignment)
		{
@@ -1601,7 +1603,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
          LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
-          DR_MISALIGNMENT (dr0) = 0;
+          SET_DR_MISALIGNMENT (dr0, 0);
	  if (vect_print_dump_info (REPORT_ALIGNMENT))
	    fprintf (vect_dump, "Alignment of access forced using peeling.");
@@ -1702,7 +1704,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
	    {
	      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
	      dr = STMT_VINFO_DATA_REF (stmt_info);
-	      DR_MISALIGNMENT (dr) = 0;
+	      SET_DR_MISALIGNMENT (dr, 0);
	      if (vect_print_dump_info (REPORT_ALIGNMENT))
		fprintf (vect_dump, "Alignment of access forced using versioning.");
	    }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index dd8e2b8ad56..6e20652d7ba 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -339,7 +339,8 @@ is_pattern_stmt_p (stmt_vec_info stmt_info)
 /* Reflects actual alignment of first access in the vectorized loop,
    taking into account peeling/versioning if applied.  */
-#define DR_MISALIGNMENT(DR) (DR)->aux
+#define DR_MISALIGNMENT(DR) ((int) (size_t) (DR)->aux)
+#define SET_DR_MISALIGNMENT(DR, VAL) ((DR)->aux = (void *) (size_t) (VAL))
 
 static inline bool
 aligned_access_p (struct data_reference *data_ref_info)
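Since aux is now a pointer, DR_MISALIGNMENT expands to a cast rvalue and can no longer be assigned or updated in place, which is why vect_update_misalignment_for_peel above goes through a temporary. A minimal sketch of the round-trip the two macros implement (illustration only):

    void *aux;
    aux = (void *) (size_t) -1;         /* SET_DR_MISALIGNMENT (dr, -1) */
    if ((int) (size_t) aux == -1)       /* DR_MISALIGNMENT (dr) */
      ;                                 /* misalignment is unknown */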